//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static std::atomic<unsigned> GlobalUniqueCallSite;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));

static cl::opt<unsigned>
    FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                        cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
                                 " 1: do it, 2: do it aggressively)"),
                        cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}

bool
NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16: // <4 x f16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64)
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16 elements. We must match this so we
      // stay in sync with Ins/Outs.
      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
        EltVT = MVT::v2f16;
        NumElts /= 2;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
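//
// For example, four f32 pieces at offsets 0, 4, 8 and 12 of a parameter
// aligned to 16 bytes can all be covered by a single 128-bit access.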
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
  return NumElts;
}

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  // Check what we can vectorize using 128/64/32/16-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
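      // NumElts is 1 when the pieces starting at I cannot be merged at this
      // AccessSize; otherwise it is 2 or 4, the only vector widths PTX
      // load/store ops support. E.g. four contiguous f32 pieces under 16-byte
      // alignment give NumElts == 4.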
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
                 MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
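  // For example, sign_extend_inreg from i1 held in an i32 register has no cvt
  // form and is expanded to (shl x, 31) followed by (sra x, 31).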
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  // Turn FP truncstore into trunc + store.
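  // For example, a truncating store of an f32 value into an f16 slot becomes
  // an FP_ROUND to f16 followed by an ordinary f16 store.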
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // This is legal in NVPTX
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

  // TRAP can be lowered to PTX trap
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Custom handling for i8 intrinsics
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SREM);
  setTargetDAGCombine(ISD::UREM);

  // setcc for f16x2 needs special handling to prevent legalizer's
  // attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have a full implementation. Others
  // only have a token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
  }

  // There's no neg.f16 instruction. Expand to (0-x).
  setOperationAction(ISD::FNEG, MVT::f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);

  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }

  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op :
       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FABS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }
  // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
  auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
    bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
    return IsAtLeastSm80 ? Legal : NotSm80Action;
  };
  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  }
  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
    setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  }

  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
  // No FPOW or FREM in PTX.
582 583 // Now deduce the information based on the above mentioned 584 // actions 585 computeRegisterProperties(STI.getRegisterInfo()); 586 } 587 588 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 589 switch ((NVPTXISD::NodeType)Opcode) { 590 case NVPTXISD::FIRST_NUMBER: 591 break; 592 case NVPTXISD::CALL: 593 return "NVPTXISD::CALL"; 594 case NVPTXISD::RET_FLAG: 595 return "NVPTXISD::RET_FLAG"; 596 case NVPTXISD::LOAD_PARAM: 597 return "NVPTXISD::LOAD_PARAM"; 598 case NVPTXISD::Wrapper: 599 return "NVPTXISD::Wrapper"; 600 case NVPTXISD::DeclareParam: 601 return "NVPTXISD::DeclareParam"; 602 case NVPTXISD::DeclareScalarParam: 603 return "NVPTXISD::DeclareScalarParam"; 604 case NVPTXISD::DeclareRet: 605 return "NVPTXISD::DeclareRet"; 606 case NVPTXISD::DeclareScalarRet: 607 return "NVPTXISD::DeclareScalarRet"; 608 case NVPTXISD::DeclareRetParam: 609 return "NVPTXISD::DeclareRetParam"; 610 case NVPTXISD::PrintCall: 611 return "NVPTXISD::PrintCall"; 612 case NVPTXISD::PrintConvergentCall: 613 return "NVPTXISD::PrintConvergentCall"; 614 case NVPTXISD::PrintCallUni: 615 return "NVPTXISD::PrintCallUni"; 616 case NVPTXISD::PrintConvergentCallUni: 617 return "NVPTXISD::PrintConvergentCallUni"; 618 case NVPTXISD::LoadParam: 619 return "NVPTXISD::LoadParam"; 620 case NVPTXISD::LoadParamV2: 621 return "NVPTXISD::LoadParamV2"; 622 case NVPTXISD::LoadParamV4: 623 return "NVPTXISD::LoadParamV4"; 624 case NVPTXISD::StoreParam: 625 return "NVPTXISD::StoreParam"; 626 case NVPTXISD::StoreParamV2: 627 return "NVPTXISD::StoreParamV2"; 628 case NVPTXISD::StoreParamV4: 629 return "NVPTXISD::StoreParamV4"; 630 case NVPTXISD::StoreParamS32: 631 return "NVPTXISD::StoreParamS32"; 632 case NVPTXISD::StoreParamU32: 633 return "NVPTXISD::StoreParamU32"; 634 case NVPTXISD::CallArgBegin: 635 return "NVPTXISD::CallArgBegin"; 636 case NVPTXISD::CallArg: 637 return "NVPTXISD::CallArg"; 638 case NVPTXISD::LastCallArg: 639 return "NVPTXISD::LastCallArg"; 640 case NVPTXISD::CallArgEnd: 641 return "NVPTXISD::CallArgEnd"; 642 case NVPTXISD::CallVoid: 643 return "NVPTXISD::CallVoid"; 644 case NVPTXISD::CallVal: 645 return "NVPTXISD::CallVal"; 646 case NVPTXISD::CallSymbol: 647 return "NVPTXISD::CallSymbol"; 648 case NVPTXISD::Prototype: 649 return "NVPTXISD::Prototype"; 650 case NVPTXISD::MoveParam: 651 return "NVPTXISD::MoveParam"; 652 case NVPTXISD::StoreRetval: 653 return "NVPTXISD::StoreRetval"; 654 case NVPTXISD::StoreRetvalV2: 655 return "NVPTXISD::StoreRetvalV2"; 656 case NVPTXISD::StoreRetvalV4: 657 return "NVPTXISD::StoreRetvalV4"; 658 case NVPTXISD::PseudoUseParam: 659 return "NVPTXISD::PseudoUseParam"; 660 case NVPTXISD::RETURN: 661 return "NVPTXISD::RETURN"; 662 case NVPTXISD::CallSeqBegin: 663 return "NVPTXISD::CallSeqBegin"; 664 case NVPTXISD::CallSeqEnd: 665 return "NVPTXISD::CallSeqEnd"; 666 case NVPTXISD::CallPrototype: 667 return "NVPTXISD::CallPrototype"; 668 case NVPTXISD::ProxyReg: 669 return "NVPTXISD::ProxyReg"; 670 case NVPTXISD::LoadV2: 671 return "NVPTXISD::LoadV2"; 672 case NVPTXISD::LoadV4: 673 return "NVPTXISD::LoadV4"; 674 case NVPTXISD::LDGV2: 675 return "NVPTXISD::LDGV2"; 676 case NVPTXISD::LDGV4: 677 return "NVPTXISD::LDGV4"; 678 case NVPTXISD::LDUV2: 679 return "NVPTXISD::LDUV2"; 680 case NVPTXISD::LDUV4: 681 return "NVPTXISD::LDUV4"; 682 case NVPTXISD::StoreV2: 683 return "NVPTXISD::StoreV2"; 684 case NVPTXISD::StoreV4: 685 return "NVPTXISD::StoreV4"; 686 case NVPTXISD::FUN_SHFL_CLAMP: 687 return "NVPTXISD::FUN_SHFL_CLAMP"; 688 case NVPTXISD::FUN_SHFR_CLAMP: 689 
return "NVPTXISD::FUN_SHFR_CLAMP"; 690 case NVPTXISD::IMAD: 691 return "NVPTXISD::IMAD"; 692 case NVPTXISD::SETP_F16X2: 693 return "NVPTXISD::SETP_F16X2"; 694 case NVPTXISD::Dummy: 695 return "NVPTXISD::Dummy"; 696 case NVPTXISD::MUL_WIDE_SIGNED: 697 return "NVPTXISD::MUL_WIDE_SIGNED"; 698 case NVPTXISD::MUL_WIDE_UNSIGNED: 699 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 700 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; 701 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 702 case NVPTXISD::Tex1DFloatFloatLevel: 703 return "NVPTXISD::Tex1DFloatFloatLevel"; 704 case NVPTXISD::Tex1DFloatFloatGrad: 705 return "NVPTXISD::Tex1DFloatFloatGrad"; 706 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; 707 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; 708 case NVPTXISD::Tex1DS32FloatLevel: 709 return "NVPTXISD::Tex1DS32FloatLevel"; 710 case NVPTXISD::Tex1DS32FloatGrad: 711 return "NVPTXISD::Tex1DS32FloatGrad"; 712 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; 713 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; 714 case NVPTXISD::Tex1DU32FloatLevel: 715 return "NVPTXISD::Tex1DU32FloatLevel"; 716 case NVPTXISD::Tex1DU32FloatGrad: 717 return "NVPTXISD::Tex1DU32FloatGrad"; 718 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; 719 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; 720 case NVPTXISD::Tex1DArrayFloatFloatLevel: 721 return "NVPTXISD::Tex1DArrayFloatFloatLevel"; 722 case NVPTXISD::Tex1DArrayFloatFloatGrad: 723 return "NVPTXISD::Tex1DArrayFloatFloatGrad"; 724 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; 725 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; 726 case NVPTXISD::Tex1DArrayS32FloatLevel: 727 return "NVPTXISD::Tex1DArrayS32FloatLevel"; 728 case NVPTXISD::Tex1DArrayS32FloatGrad: 729 return "NVPTXISD::Tex1DArrayS32FloatGrad"; 730 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; 731 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; 732 case NVPTXISD::Tex1DArrayU32FloatLevel: 733 return "NVPTXISD::Tex1DArrayU32FloatLevel"; 734 case NVPTXISD::Tex1DArrayU32FloatGrad: 735 return "NVPTXISD::Tex1DArrayU32FloatGrad"; 736 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; 737 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 738 case NVPTXISD::Tex2DFloatFloatLevel: 739 return "NVPTXISD::Tex2DFloatFloatLevel"; 740 case NVPTXISD::Tex2DFloatFloatGrad: 741 return "NVPTXISD::Tex2DFloatFloatGrad"; 742 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; 743 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; 744 case NVPTXISD::Tex2DS32FloatLevel: 745 return "NVPTXISD::Tex2DS32FloatLevel"; 746 case NVPTXISD::Tex2DS32FloatGrad: 747 return "NVPTXISD::Tex2DS32FloatGrad"; 748 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; 749 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; 750 case NVPTXISD::Tex2DU32FloatLevel: 751 return "NVPTXISD::Tex2DU32FloatLevel"; 752 case NVPTXISD::Tex2DU32FloatGrad: 753 return "NVPTXISD::Tex2DU32FloatGrad"; 754 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; 755 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 756 case NVPTXISD::Tex2DArrayFloatFloatLevel: 757 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 758 case NVPTXISD::Tex2DArrayFloatFloatGrad: 759 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 760 case 
NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; 761 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; 762 case NVPTXISD::Tex2DArrayS32FloatLevel: 763 return "NVPTXISD::Tex2DArrayS32FloatLevel"; 764 case NVPTXISD::Tex2DArrayS32FloatGrad: 765 return "NVPTXISD::Tex2DArrayS32FloatGrad"; 766 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; 767 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; 768 case NVPTXISD::Tex2DArrayU32FloatLevel: 769 return "NVPTXISD::Tex2DArrayU32FloatLevel"; 770 case NVPTXISD::Tex2DArrayU32FloatGrad: 771 return "NVPTXISD::Tex2DArrayU32FloatGrad"; 772 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; 773 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 774 case NVPTXISD::Tex3DFloatFloatLevel: 775 return "NVPTXISD::Tex3DFloatFloatLevel"; 776 case NVPTXISD::Tex3DFloatFloatGrad: 777 return "NVPTXISD::Tex3DFloatFloatGrad"; 778 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; 779 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; 780 case NVPTXISD::Tex3DS32FloatLevel: 781 return "NVPTXISD::Tex3DS32FloatLevel"; 782 case NVPTXISD::Tex3DS32FloatGrad: 783 return "NVPTXISD::Tex3DS32FloatGrad"; 784 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; 785 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; 786 case NVPTXISD::Tex3DU32FloatLevel: 787 return "NVPTXISD::Tex3DU32FloatLevel"; 788 case NVPTXISD::Tex3DU32FloatGrad: 789 return "NVPTXISD::Tex3DU32FloatGrad"; 790 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; 791 case NVPTXISD::TexCubeFloatFloatLevel: 792 return "NVPTXISD::TexCubeFloatFloatLevel"; 793 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; 794 case NVPTXISD::TexCubeS32FloatLevel: 795 return "NVPTXISD::TexCubeS32FloatLevel"; 796 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; 797 case NVPTXISD::TexCubeU32FloatLevel: 798 return "NVPTXISD::TexCubeU32FloatLevel"; 799 case NVPTXISD::TexCubeArrayFloatFloat: 800 return "NVPTXISD::TexCubeArrayFloatFloat"; 801 case NVPTXISD::TexCubeArrayFloatFloatLevel: 802 return "NVPTXISD::TexCubeArrayFloatFloatLevel"; 803 case NVPTXISD::TexCubeArrayS32Float: 804 return "NVPTXISD::TexCubeArrayS32Float"; 805 case NVPTXISD::TexCubeArrayS32FloatLevel: 806 return "NVPTXISD::TexCubeArrayS32FloatLevel"; 807 case NVPTXISD::TexCubeArrayU32Float: 808 return "NVPTXISD::TexCubeArrayU32Float"; 809 case NVPTXISD::TexCubeArrayU32FloatLevel: 810 return "NVPTXISD::TexCubeArrayU32FloatLevel"; 811 case NVPTXISD::Tld4R2DFloatFloat: 812 return "NVPTXISD::Tld4R2DFloatFloat"; 813 case NVPTXISD::Tld4G2DFloatFloat: 814 return "NVPTXISD::Tld4G2DFloatFloat"; 815 case NVPTXISD::Tld4B2DFloatFloat: 816 return "NVPTXISD::Tld4B2DFloatFloat"; 817 case NVPTXISD::Tld4A2DFloatFloat: 818 return "NVPTXISD::Tld4A2DFloatFloat"; 819 case NVPTXISD::Tld4R2DS64Float: 820 return "NVPTXISD::Tld4R2DS64Float"; 821 case NVPTXISD::Tld4G2DS64Float: 822 return "NVPTXISD::Tld4G2DS64Float"; 823 case NVPTXISD::Tld4B2DS64Float: 824 return "NVPTXISD::Tld4B2DS64Float"; 825 case NVPTXISD::Tld4A2DS64Float: 826 return "NVPTXISD::Tld4A2DS64Float"; 827 case NVPTXISD::Tld4R2DU64Float: 828 return "NVPTXISD::Tld4R2DU64Float"; 829 case NVPTXISD::Tld4G2DU64Float: 830 return "NVPTXISD::Tld4G2DU64Float"; 831 case NVPTXISD::Tld4B2DU64Float: 832 return "NVPTXISD::Tld4B2DU64Float"; 833 case NVPTXISD::Tld4A2DU64Float: 834 return "NVPTXISD::Tld4A2DU64Float"; 835 836 case 
NVPTXISD::TexUnified1DFloatS32: 837 return "NVPTXISD::TexUnified1DFloatS32"; 838 case NVPTXISD::TexUnified1DFloatFloat: 839 return "NVPTXISD::TexUnified1DFloatFloat"; 840 case NVPTXISD::TexUnified1DFloatFloatLevel: 841 return "NVPTXISD::TexUnified1DFloatFloatLevel"; 842 case NVPTXISD::TexUnified1DFloatFloatGrad: 843 return "NVPTXISD::TexUnified1DFloatFloatGrad"; 844 case NVPTXISD::TexUnified1DS32S32: 845 return "NVPTXISD::TexUnified1DS32S32"; 846 case NVPTXISD::TexUnified1DS32Float: 847 return "NVPTXISD::TexUnified1DS32Float"; 848 case NVPTXISD::TexUnified1DS32FloatLevel: 849 return "NVPTXISD::TexUnified1DS32FloatLevel"; 850 case NVPTXISD::TexUnified1DS32FloatGrad: 851 return "NVPTXISD::TexUnified1DS32FloatGrad"; 852 case NVPTXISD::TexUnified1DU32S32: 853 return "NVPTXISD::TexUnified1DU32S32"; 854 case NVPTXISD::TexUnified1DU32Float: 855 return "NVPTXISD::TexUnified1DU32Float"; 856 case NVPTXISD::TexUnified1DU32FloatLevel: 857 return "NVPTXISD::TexUnified1DU32FloatLevel"; 858 case NVPTXISD::TexUnified1DU32FloatGrad: 859 return "NVPTXISD::TexUnified1DU32FloatGrad"; 860 case NVPTXISD::TexUnified1DArrayFloatS32: 861 return "NVPTXISD::TexUnified1DArrayFloatS32"; 862 case NVPTXISD::TexUnified1DArrayFloatFloat: 863 return "NVPTXISD::TexUnified1DArrayFloatFloat"; 864 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 865 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; 866 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 867 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; 868 case NVPTXISD::TexUnified1DArrayS32S32: 869 return "NVPTXISD::TexUnified1DArrayS32S32"; 870 case NVPTXISD::TexUnified1DArrayS32Float: 871 return "NVPTXISD::TexUnified1DArrayS32Float"; 872 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 873 return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; 874 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 875 return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; 876 case NVPTXISD::TexUnified1DArrayU32S32: 877 return "NVPTXISD::TexUnified1DArrayU32S32"; 878 case NVPTXISD::TexUnified1DArrayU32Float: 879 return "NVPTXISD::TexUnified1DArrayU32Float"; 880 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 881 return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; 882 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 883 return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; 884 case NVPTXISD::TexUnified2DFloatS32: 885 return "NVPTXISD::TexUnified2DFloatS32"; 886 case NVPTXISD::TexUnified2DFloatFloat: 887 return "NVPTXISD::TexUnified2DFloatFloat"; 888 case NVPTXISD::TexUnified2DFloatFloatLevel: 889 return "NVPTXISD::TexUnified2DFloatFloatLevel"; 890 case NVPTXISD::TexUnified2DFloatFloatGrad: 891 return "NVPTXISD::TexUnified2DFloatFloatGrad"; 892 case NVPTXISD::TexUnified2DS32S32: 893 return "NVPTXISD::TexUnified2DS32S32"; 894 case NVPTXISD::TexUnified2DS32Float: 895 return "NVPTXISD::TexUnified2DS32Float"; 896 case NVPTXISD::TexUnified2DS32FloatLevel: 897 return "NVPTXISD::TexUnified2DS32FloatLevel"; 898 case NVPTXISD::TexUnified2DS32FloatGrad: 899 return "NVPTXISD::TexUnified2DS32FloatGrad"; 900 case NVPTXISD::TexUnified2DU32S32: 901 return "NVPTXISD::TexUnified2DU32S32"; 902 case NVPTXISD::TexUnified2DU32Float: 903 return "NVPTXISD::TexUnified2DU32Float"; 904 case NVPTXISD::TexUnified2DU32FloatLevel: 905 return "NVPTXISD::TexUnified2DU32FloatLevel"; 906 case NVPTXISD::TexUnified2DU32FloatGrad: 907 return "NVPTXISD::TexUnified2DU32FloatGrad"; 908 case NVPTXISD::TexUnified2DArrayFloatS32: 909 return "NVPTXISD::TexUnified2DArrayFloatS32"; 910 case NVPTXISD::TexUnified2DArrayFloatFloat: 911 return 
"NVPTXISD::TexUnified2DArrayFloatFloat"; 912 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 913 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; 914 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 915 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; 916 case NVPTXISD::TexUnified2DArrayS32S32: 917 return "NVPTXISD::TexUnified2DArrayS32S32"; 918 case NVPTXISD::TexUnified2DArrayS32Float: 919 return "NVPTXISD::TexUnified2DArrayS32Float"; 920 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 921 return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; 922 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 923 return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; 924 case NVPTXISD::TexUnified2DArrayU32S32: 925 return "NVPTXISD::TexUnified2DArrayU32S32"; 926 case NVPTXISD::TexUnified2DArrayU32Float: 927 return "NVPTXISD::TexUnified2DArrayU32Float"; 928 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 929 return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; 930 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 931 return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; 932 case NVPTXISD::TexUnified3DFloatS32: 933 return "NVPTXISD::TexUnified3DFloatS32"; 934 case NVPTXISD::TexUnified3DFloatFloat: 935 return "NVPTXISD::TexUnified3DFloatFloat"; 936 case NVPTXISD::TexUnified3DFloatFloatLevel: 937 return "NVPTXISD::TexUnified3DFloatFloatLevel"; 938 case NVPTXISD::TexUnified3DFloatFloatGrad: 939 return "NVPTXISD::TexUnified3DFloatFloatGrad"; 940 case NVPTXISD::TexUnified3DS32S32: 941 return "NVPTXISD::TexUnified3DS32S32"; 942 case NVPTXISD::TexUnified3DS32Float: 943 return "NVPTXISD::TexUnified3DS32Float"; 944 case NVPTXISD::TexUnified3DS32FloatLevel: 945 return "NVPTXISD::TexUnified3DS32FloatLevel"; 946 case NVPTXISD::TexUnified3DS32FloatGrad: 947 return "NVPTXISD::TexUnified3DS32FloatGrad"; 948 case NVPTXISD::TexUnified3DU32S32: 949 return "NVPTXISD::TexUnified3DU32S32"; 950 case NVPTXISD::TexUnified3DU32Float: 951 return "NVPTXISD::TexUnified3DU32Float"; 952 case NVPTXISD::TexUnified3DU32FloatLevel: 953 return "NVPTXISD::TexUnified3DU32FloatLevel"; 954 case NVPTXISD::TexUnified3DU32FloatGrad: 955 return "NVPTXISD::TexUnified3DU32FloatGrad"; 956 case NVPTXISD::TexUnifiedCubeFloatFloat: 957 return "NVPTXISD::TexUnifiedCubeFloatFloat"; 958 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 959 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; 960 case NVPTXISD::TexUnifiedCubeS32Float: 961 return "NVPTXISD::TexUnifiedCubeS32Float"; 962 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 963 return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; 964 case NVPTXISD::TexUnifiedCubeU32Float: 965 return "NVPTXISD::TexUnifiedCubeU32Float"; 966 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 967 return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; 968 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 969 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; 970 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 971 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; 972 case NVPTXISD::TexUnifiedCubeArrayS32Float: 973 return "NVPTXISD::TexUnifiedCubeArrayS32Float"; 974 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 975 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; 976 case NVPTXISD::TexUnifiedCubeArrayU32Float: 977 return "NVPTXISD::TexUnifiedCubeArrayU32Float"; 978 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 979 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; 980 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 981 return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; 982 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 983 return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; 984 
case NVPTXISD::Tld4UnifiedB2DFloatFloat: 985 return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; 986 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 987 return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; 988 case NVPTXISD::Tld4UnifiedR2DS64Float: 989 return "NVPTXISD::Tld4UnifiedR2DS64Float"; 990 case NVPTXISD::Tld4UnifiedG2DS64Float: 991 return "NVPTXISD::Tld4UnifiedG2DS64Float"; 992 case NVPTXISD::Tld4UnifiedB2DS64Float: 993 return "NVPTXISD::Tld4UnifiedB2DS64Float"; 994 case NVPTXISD::Tld4UnifiedA2DS64Float: 995 return "NVPTXISD::Tld4UnifiedA2DS64Float"; 996 case NVPTXISD::Tld4UnifiedR2DU64Float: 997 return "NVPTXISD::Tld4UnifiedR2DU64Float"; 998 case NVPTXISD::Tld4UnifiedG2DU64Float: 999 return "NVPTXISD::Tld4UnifiedG2DU64Float"; 1000 case NVPTXISD::Tld4UnifiedB2DU64Float: 1001 return "NVPTXISD::Tld4UnifiedB2DU64Float"; 1002 case NVPTXISD::Tld4UnifiedA2DU64Float: 1003 return "NVPTXISD::Tld4UnifiedA2DU64Float"; 1004 1005 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; 1006 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; 1007 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; 1008 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; 1009 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; 1010 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; 1011 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; 1012 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; 1013 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; 1014 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; 1015 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; 1016 1017 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; 1018 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; 1019 case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; 1020 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; 1021 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; 1022 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; 1023 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; 1024 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; 1025 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; 1026 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; 1027 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; 1028 1029 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; 1030 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; 1031 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; 1032 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; 1033 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; 1034 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; 1035 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; 1036 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; 1037 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; 1038 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; 1039 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; 1040 1041 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; 1042 case 
NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; 1043 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; 1044 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; 1045 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; 1046 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; 1047 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; 1048 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; 1049 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; 1050 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; 1051 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; 1052 1053 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; 1054 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; 1055 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; 1056 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; 1057 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; 1058 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; 1059 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; 1060 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; 1061 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; 1062 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; 1063 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; 1064 1065 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 1066 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 1067 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 1068 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; 1069 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 1070 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 1071 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 1072 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; 1073 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 1074 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 1075 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 1076 1077 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 1078 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 1079 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 1080 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; 1081 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 1082 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 1083 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 1084 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; 1085 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 1086 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 1087 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 1088 1089 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 1090 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; 1091 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 1092 case NVPTXISD::Suld2DI64Trap: 
return "NVPTXISD::Suld2DI64Trap"; 1093 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 1094 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; 1095 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 1096 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; 1097 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 1098 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 1099 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 1100 1101 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 1102 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 1103 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 1104 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; 1105 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 1106 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 1107 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 1108 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; 1109 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 1110 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 1111 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 1112 1113 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 1114 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 1115 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 1116 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; 1117 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 1118 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 1119 case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; 1120 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; 1121 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 1122 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 1123 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 1124 1125 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; 1126 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; 1127 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; 1128 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; 1129 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; 1130 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; 1131 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; 1132 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; 1133 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; 1134 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; 1135 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; 1136 1137 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; 1138 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; 1139 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; 1140 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; 1141 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; 1142 case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; 1143 case NVPTXISD::Suld1DArrayV2I32Zero: return 
"NVPTXISD::Suld1DArrayV2I32Zero"; 1144 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; 1145 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; 1146 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; 1147 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; 1148 1149 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; 1150 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; 1151 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; 1152 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; 1153 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; 1154 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; 1155 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; 1156 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; 1157 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; 1158 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; 1159 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; 1160 1161 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; 1162 case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; 1163 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; 1164 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; 1165 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; 1166 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; 1167 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; 1168 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; 1169 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; 1170 case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; 1171 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; 1172 1173 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; 1174 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; 1175 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; 1176 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; 1177 case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; 1178 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; 1179 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; 1180 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; 1181 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; 1182 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; 1183 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; 1184 } 1185 return nullptr; 1186 } 1187 1188 TargetLoweringBase::LegalizeTypeAction 1189 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1190 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 1191 VT.getScalarType() == MVT::i1) 1192 return TypeSplitVector; 1193 if (VT == MVT::v2f16) 1194 return TypeLegal; 1195 return TargetLoweringBase::getPreferredVectorAction(VT); 1196 } 1197 1198 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1199 int Enabled, int &ExtraSteps, 1200 bool &UseOneConst, 1201 bool Reciprocal) const { 1202 if (!(Enabled == ReciprocalEstimate::Enabled || 1203 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1204 return 
    SDValue();

  if (ExtraSteps == ReciprocalEstimate::Unspecified)
    ExtraSteps = 0;

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();
  bool Ftz = useF32FTZ(DAG.getMachineFunction());

  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IID, DL, MVT::i32), Operand);
  };

  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt. Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
  // any refinement, we must return a regular sqrt.
  if (Reciprocal || ExtraSteps > 0) {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
    else
      return SDValue();
  } else {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
    else {
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)). This is faster than
      // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
      // x * rsqrt(x).)
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, VT,
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
    }
  }
}

SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}

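// Builds the PTX ".callprototype" string for the given return type and
// argument list. For example, a callee returning i32 and taking (i32, a
// 12-byte aggregate aligned to 4 bytes) yields roughly:
//   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _,
//     .param .align 4 .b8 _[12]);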
retAlignment->value() : 0) 1295 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; 1296 } else { 1297 llvm_unreachable("Unknown return type"); 1298 } 1299 O << ") "; 1300 } 1301 O << "_ ("; 1302 1303 bool first = true; 1304 1305 unsigned OIdx = 0; 1306 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1307 Type *Ty = Args[i].Ty; 1308 if (!first) { 1309 O << ", "; 1310 } 1311 first = false; 1312 1313 if (!Outs[OIdx].Flags.isByVal()) { 1314 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { 1315 unsigned align = 0; 1316 const CallInst *CallI = cast<CallInst>(&CB); 1317 // +1 because index 0 is reserved for return type alignment 1318 if (!getAlign(*CallI, i + 1, align)) 1319 align = DL.getABITypeAlignment(Ty); 1320 unsigned sz = DL.getTypeAllocSize(Ty); 1321 O << ".param .align " << align << " .b8 "; 1322 O << "_"; 1323 O << "[" << sz << "]"; 1324 // update the index for Outs 1325 SmallVector<EVT, 16> vtparts; 1326 ComputeValueVTs(*this, DL, Ty, vtparts); 1327 if (unsigned len = vtparts.size()) 1328 OIdx += len - 1; 1329 continue; 1330 } 1331 // i8 types in IR will be i16 types in SDAG 1332 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1333 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1334 "type mismatch between callee prototype and arguments"); 1335 // scalar type 1336 unsigned sz = 0; 1337 if (isa<IntegerType>(Ty)) { 1338 sz = cast<IntegerType>(Ty)->getBitWidth(); 1339 if (sz < 32) 1340 sz = 32; 1341 } else if (isa<PointerType>(Ty)) { 1342 sz = PtrVT.getSizeInBits(); 1343 } else if (Ty->isHalfTy()) 1344 // PTX ABI requires all scalar parameters to be at least 32 1345 // bits in size. fp16 normally uses .b16 as its storage type 1346 // in PTX, so its size must be adjusted here, too. 1347 sz = 32; 1348 else 1349 sz = Ty->getPrimitiveSizeInBits(); 1350 O << ".param .b" << sz << " "; 1351 O << "_"; 1352 continue; 1353 } 1354 auto *PTy = dyn_cast<PointerType>(Ty); 1355 assert(PTy && "Param with byval attribute should be a pointer type"); 1356 Type *ETy = PTy->getPointerElementType(); 1357 1358 Align align = Outs[OIdx].Flags.getNonZeroByValAlign(); 1359 unsigned sz = DL.getTypeAllocSize(ETy); 1360 O << ".param .align " << align.value() << " .b8 "; 1361 O << "_"; 1362 O << "[" << sz << "]"; 1363 } 1364 O << ");"; 1365 return O.str(); 1366 } 1367 1368 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, 1369 const CallBase *CB, Type *Ty, 1370 unsigned Idx, 1371 const DataLayout &DL) const { 1372 if (!CB) { 1373 // CallSite is zero, fallback to ABI type alignment 1374 return DL.getABITypeAlign(Ty); 1375 } 1376 1377 unsigned Alignment = 0; 1378 const Function *DirectCallee = CB->getCalledFunction(); 1379 1380 if (!DirectCallee) { 1381 // We don't have a direct function symbol, but that may be because of 1382 // constant cast instructions in the call. 1383 1384 // With bitcast'd call targets, the instruction will be the call 1385 if (const auto *CI = dyn_cast<CallInst>(CB)) { 1386 // Check if we have call alignment metadata 1387 if (getAlign(*CI, Idx, Alignment)) 1388 return Align(Alignment); 1389 1390 const Value *CalleeV = CI->getCalledOperand(); 1391 // Ignore any bitcast instructions 1392 while (isa<ConstantExpr>(CalleeV)) { 1393 const ConstantExpr *CE = cast<ConstantExpr>(CalleeV); 1394 if (!CE->isCast()) 1395 break; 1396 // Look through the bitcast 1397 CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0); 1398 } 1399 1400 // We have now looked past all of the bitcasts. Do we finally have a 1401 // Function? 
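      // For illustration (hypothetical IR, not from this file): a call whose
      // callee is the constant expression
      //   bitcast (void (i32)* @f to void (i64)*)
      // is walked back to @f here, so alignment information attached to @f
      // can still be used below.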
1402 if (const auto *CalleeF = dyn_cast<Function>(CalleeV)) 1403 DirectCallee = CalleeF; 1404 } 1405 } 1406 1407 // Check for function alignment information if we found that the 1408 // ultimate target is a Function 1409 if (DirectCallee) 1410 if (getAlign(*DirectCallee, Idx, Alignment)) 1411 return Align(Alignment); 1412 1413 // Call is indirect or alignment information is not available, fall back to 1414 // the ABI type alignment 1415 return DL.getABITypeAlign(Ty); 1416 } 1417 1418 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1419 SmallVectorImpl<SDValue> &InVals) const { 1420 SelectionDAG &DAG = CLI.DAG; 1421 SDLoc dl = CLI.DL; 1422 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1423 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1424 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1425 SDValue Chain = CLI.Chain; 1426 SDValue Callee = CLI.Callee; 1427 bool &isTailCall = CLI.IsTailCall; 1428 ArgListTy &Args = CLI.getArgs(); 1429 Type *RetTy = CLI.RetTy; 1430 const CallBase *CB = CLI.CB; 1431 const DataLayout &DL = DAG.getDataLayout(); 1432 1433 bool isABI = (STI.getSmVersion() >= 20); 1434 assert(isABI && "Non-ABI compilation is not supported"); 1435 if (!isABI) 1436 return Chain; 1437 1438 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); 1439 SDValue tempChain = Chain; 1440 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); 1441 SDValue InFlag = Chain.getValue(1); 1442 1443 unsigned paramCount = 0; 1444 // Args.size() and Outs.size() need not match. 1445 // Outs.size() will be larger 1446 // * if there is an aggregate argument with multiple fields (each field 1447 // showing up separately in Outs) 1448 // * if there is a vector argument with more than typical vector-length 1449 // elements (generally if more than 4) where each vector element is 1450 // individually present in Outs. 1451 // So a different index should be used for indexing into Outs/OutVals. 1452 // See similar issue in LowerFormalArguments. 1453 unsigned OIdx = 0; 1454 // Declare the .params or .reg need to pass values 1455 // to the function 1456 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1457 EVT VT = Outs[OIdx].VT; 1458 Type *Ty = Args[i].Ty; 1459 1460 if (!Outs[OIdx].Flags.isByVal()) { 1461 SmallVector<EVT, 16> VTs; 1462 SmallVector<uint64_t, 16> Offsets; 1463 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); 1464 Align ArgAlign = getArgumentAlignment(Callee, CB, Ty, paramCount + 1, DL); 1465 unsigned AllocSize = DL.getTypeAllocSize(Ty); 1466 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1467 bool NeedAlign; // Does argument declaration specify alignment? 1468 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { 1469 // declare .param .align <align> .b8 .param<n>[<size>]; 1470 SDValue DeclareParamOps[] = { 1471 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1472 DAG.getConstant(paramCount, dl, MVT::i32), 1473 DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; 1474 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1475 DeclareParamOps); 1476 NeedAlign = true; 1477 } else { 1478 // declare .param .b<size> .param<n>; 1479 if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { 1480 // PTX ABI requires integral types to be at least 32 bits in 1481 // size. FP16 is loaded/stored using i16, so it's handled 1482 // here as well. 
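        // For example, an i8 or f16 scalar argument still gets a 32-bit
        // container, i.e. roughly ".param .b32 param<n>;" in the emitted PTX
        // (illustrative; the exact spelling comes from the asm printer).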
1483 AllocSize = 4; 1484 } 1485 SDValue DeclareScalarParamOps[] = { 1486 Chain, DAG.getConstant(paramCount, dl, MVT::i32), 1487 DAG.getConstant(AllocSize * 8, dl, MVT::i32), 1488 DAG.getConstant(0, dl, MVT::i32), InFlag}; 1489 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1490 DeclareScalarParamOps); 1491 NeedAlign = false; 1492 } 1493 InFlag = Chain.getValue(1); 1494 1495 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1496 // than 32-bits are sign extended or zero extended, depending on 1497 // whether they are signed or unsigned types. This case applies 1498 // only to scalar parameters and not to aggregate values. 1499 bool ExtendIntegerParam = 1500 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1501 1502 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); 1503 SmallVector<SDValue, 6> StoreOperands; 1504 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1505 // New store. 1506 if (VectorInfo[j] & PVF_FIRST) { 1507 assert(StoreOperands.empty() && "Unfinished preceding store."); 1508 StoreOperands.push_back(Chain); 1509 StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); 1510 StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); 1511 } 1512 1513 EVT EltVT = VTs[j]; 1514 SDValue StVal = OutVals[OIdx]; 1515 if (ExtendIntegerParam) { 1516 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1517 // zext/sext to i32 1518 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1519 : ISD::ZERO_EXTEND, 1520 dl, MVT::i32, StVal); 1521 } else if (EltVT.getSizeInBits() < 16) { 1522 // Use 16-bit registers for small stores as it's the 1523 // smallest general purpose register size supported by NVPTX. 1524 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1525 } 1526 1527 // Record the value to store. 1528 StoreOperands.push_back(StVal); 1529 1530 if (VectorInfo[j] & PVF_LAST) { 1531 unsigned NumElts = StoreOperands.size() - 3; 1532 NVPTXISD::NodeType Op; 1533 switch (NumElts) { 1534 case 1: 1535 Op = NVPTXISD::StoreParam; 1536 break; 1537 case 2: 1538 Op = NVPTXISD::StoreParamV2; 1539 break; 1540 case 4: 1541 Op = NVPTXISD::StoreParamV4; 1542 break; 1543 default: 1544 llvm_unreachable("Invalid vector info."); 1545 } 1546 1547 StoreOperands.push_back(InFlag); 1548 1549 // Adjust type of the store op if we've extended the scalar 1550 // return value. 1551 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; 1552 MaybeAlign EltAlign; 1553 if (NeedAlign) 1554 EltAlign = commonAlignment(ArgAlign, Offsets[j]); 1555 1556 Chain = DAG.getMemIntrinsicNode( 1557 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1558 TheStoreType, MachinePointerInfo(), EltAlign, 1559 MachineMemOperand::MOStore); 1560 InFlag = Chain.getValue(1); 1561 1562 // Cleanup. 
1563 StoreOperands.clear(); 1564 } 1565 ++OIdx; 1566 } 1567 assert(StoreOperands.empty() && "Unfinished parameter store."); 1568 if (VTs.size() > 0) 1569 --OIdx; 1570 ++paramCount; 1571 continue; 1572 } 1573 1574 // ByVal arguments 1575 SmallVector<EVT, 16> VTs; 1576 SmallVector<uint64_t, 16> Offsets; 1577 auto *PTy = dyn_cast<PointerType>(Args[i].Ty); 1578 assert(PTy && "Type of a byval parameter should be pointer"); 1579 ComputePTXValueVTs(*this, DL, PTy->getPointerElementType(), VTs, &Offsets, 1580 0); 1581 1582 // declare .param .align <align> .b8 .param<n>[<size>]; 1583 unsigned sz = Outs[OIdx].Flags.getByValSize(); 1584 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1585 Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1586 // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, 1587 // so we don't need to worry about natural alignment or not. 1588 // See TargetLowering::LowerCallTo(). 1589 1590 // Enforce minumum alignment of 4 to work around ptxas miscompile 1591 // for sm_50+. See corresponding alignment adjustment in 1592 // emitFunctionParamList() for details. 1593 if (ArgAlign < Align(4)) 1594 ArgAlign = Align(4); 1595 SDValue DeclareParamOps[] = { 1596 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1597 DAG.getConstant(paramCount, dl, MVT::i32), 1598 DAG.getConstant(sz, dl, MVT::i32), InFlag}; 1599 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1600 DeclareParamOps); 1601 InFlag = Chain.getValue(1); 1602 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1603 EVT elemtype = VTs[j]; 1604 int curOffset = Offsets[j]; 1605 unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset); 1606 auto PtrVT = getPointerTy(DL); 1607 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], 1608 DAG.getConstant(curOffset, dl, PtrVT)); 1609 SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, 1610 MachinePointerInfo(), PartAlign); 1611 if (elemtype.getSizeInBits() < 16) { 1612 theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); 1613 } 1614 SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1615 SDValue CopyParamOps[] = { Chain, 1616 DAG.getConstant(paramCount, dl, MVT::i32), 1617 DAG.getConstant(curOffset, dl, MVT::i32), 1618 theVal, InFlag }; 1619 Chain = DAG.getMemIntrinsicNode( 1620 NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype, 1621 MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore); 1622 1623 InFlag = Chain.getValue(1); 1624 } 1625 ++paramCount; 1626 } 1627 1628 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); 1629 MaybeAlign retAlignment = None; 1630 1631 // Handle Result 1632 if (Ins.size() > 0) { 1633 SmallVector<EVT, 16> resvtparts; 1634 ComputeValueVTs(*this, DL, RetTy, resvtparts); 1635 1636 // Declare 1637 // .param .align 16 .b8 retval0[<size-in-bytes>], or 1638 // .param .b<size-in-bits> retval0 1639 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); 1640 // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for 1641 // these three types to match the logic in 1642 // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype. 1643 // Plus, this behavior is consistent with nvcc's. 
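    // Illustrative declarations (sizes and names are examples only):
    //   float or i32 return   -> .param .b32 retval0;
    //   16-byte struct return -> .param .align <align> .b8 retval0[16];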
1644 if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() || 1645 (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) { 1646 // Scalar needs to be at least 32bit wide 1647 if (resultsz < 32) 1648 resultsz = 32; 1649 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1650 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1651 DAG.getConstant(resultsz, dl, MVT::i32), 1652 DAG.getConstant(0, dl, MVT::i32), InFlag }; 1653 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, 1654 DeclareRetOps); 1655 InFlag = Chain.getValue(1); 1656 } else { 1657 retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 1658 assert(retAlignment && "retAlignment is guaranteed to be set"); 1659 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1660 SDValue DeclareRetOps[] = { 1661 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), 1662 DAG.getConstant(resultsz / 8, dl, MVT::i32), 1663 DAG.getConstant(0, dl, MVT::i32), InFlag}; 1664 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, 1665 DeclareRetOps); 1666 InFlag = Chain.getValue(1); 1667 } 1668 } 1669 1670 // Both indirect calls and libcalls have nullptr Func. In order to distinguish 1671 // between them we must rely on the call site value which is valid for 1672 // indirect calls but is always null for libcalls. 1673 bool isIndirectCall = !Func && CB; 1674 1675 if (isa<ExternalSymbolSDNode>(Callee)) { 1676 Function* CalleeFunc = nullptr; 1677 1678 // Try to find the callee in the current module. 1679 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc); 1680 assert(CalleeFunc != nullptr && "Libcall callee must be set."); 1681 1682 // Set the "libcall callee" attribute to indicate that the function 1683 // must always have a declaration. 1684 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true"); 1685 } 1686 1687 if (isIndirectCall) { 1688 // This is indirect function call case : PTX requires a prototype of the 1689 // form 1690 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); 1691 // to be emitted, and the label has to used as the last arg of call 1692 // instruction. 1693 // The prototype is embedded in a string and put as the operand for a 1694 // CallPrototype SDNode which will print out to the value of the string. 1695 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1696 std::string Proto = 1697 getPrototype(DL, RetTy, Args, Outs, retAlignment, *CB, UniqueCallSite); 1698 const char *ProtoStr = 1699 nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); 1700 SDValue ProtoOps[] = { 1701 Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag, 1702 }; 1703 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); 1704 InFlag = Chain.getValue(1); 1705 } 1706 // Op to just print "call" 1707 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1708 SDValue PrintCallOps[] = { 1709 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag 1710 }; 1711 // We model convergent calls as separate opcodes. 1712 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; 1713 if (CLI.IsConvergent) 1714 Opcode = Opcode == NVPTXISD::PrintCallUni ? 
NVPTXISD::PrintConvergentCallUni 1715 : NVPTXISD::PrintConvergentCall; 1716 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 1717 InFlag = Chain.getValue(1); 1718 1719 // Ops to print out the function name 1720 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1721 SDValue CallVoidOps[] = { Chain, Callee, InFlag }; 1722 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 1723 InFlag = Chain.getValue(1); 1724 1725 // Ops to print out the param list 1726 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1727 SDValue CallArgBeginOps[] = { Chain, InFlag }; 1728 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 1729 CallArgBeginOps); 1730 InFlag = Chain.getValue(1); 1731 1732 for (unsigned i = 0, e = paramCount; i != e; ++i) { 1733 unsigned opcode; 1734 if (i == (e - 1)) 1735 opcode = NVPTXISD::LastCallArg; 1736 else 1737 opcode = NVPTXISD::CallArg; 1738 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1739 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1740 DAG.getConstant(i, dl, MVT::i32), InFlag }; 1741 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 1742 InFlag = Chain.getValue(1); 1743 } 1744 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1745 SDValue CallArgEndOps[] = { Chain, 1746 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32), 1747 InFlag }; 1748 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 1749 InFlag = Chain.getValue(1); 1750 1751 if (isIndirectCall) { 1752 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1753 SDValue PrototypeOps[] = { 1754 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InFlag}; 1755 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 1756 InFlag = Chain.getValue(1); 1757 } 1758 1759 SmallVector<SDValue, 16> ProxyRegOps; 1760 SmallVector<Optional<MVT>, 16> ProxyRegTruncates; 1761 1762 // Generate loads from param memory/moves from registers for result 1763 if (Ins.size() > 0) { 1764 SmallVector<EVT, 16> VTs; 1765 SmallVector<uint64_t, 16> Offsets; 1766 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 1767 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 1768 1769 Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 1770 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 1771 1772 SmallVector<EVT, 6> LoadVTs; 1773 int VecIdx = -1; // Index of the first element of the vector. 1774 1775 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 1776 // 32-bits are sign extended or zero extended, depending on whether 1777 // they are signed or unsigned types. 1778 bool ExtendIntegerRetVal = 1779 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 1780 1781 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 1782 bool needTruncate = false; 1783 EVT TheLoadType = VTs[i]; 1784 EVT EltType = Ins[i].VT; 1785 Align EltAlign = commonAlignment(RetAlign, Offsets[i]); 1786 if (ExtendIntegerRetVal) { 1787 TheLoadType = MVT::i32; 1788 EltType = MVT::i32; 1789 needTruncate = true; 1790 } else if (TheLoadType.getSizeInBits() < 16) { 1791 if (VTs[i].isInteger()) 1792 needTruncate = true; 1793 EltType = MVT::i16; 1794 } 1795 1796 // Record index of the very first element of the vector. 
1797 if (VectorInfo[i] & PVF_FIRST) { 1798 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 1799 VecIdx = i; 1800 } 1801 1802 LoadVTs.push_back(EltType); 1803 1804 if (VectorInfo[i] & PVF_LAST) { 1805 unsigned NumElts = LoadVTs.size(); 1806 LoadVTs.push_back(MVT::Other); 1807 LoadVTs.push_back(MVT::Glue); 1808 NVPTXISD::NodeType Op; 1809 switch (NumElts) { 1810 case 1: 1811 Op = NVPTXISD::LoadParam; 1812 break; 1813 case 2: 1814 Op = NVPTXISD::LoadParamV2; 1815 break; 1816 case 4: 1817 Op = NVPTXISD::LoadParamV4; 1818 break; 1819 default: 1820 llvm_unreachable("Invalid vector info."); 1821 } 1822 1823 SDValue LoadOperands[] = { 1824 Chain, DAG.getConstant(1, dl, MVT::i32), 1825 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; 1826 SDValue RetVal = DAG.getMemIntrinsicNode( 1827 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 1828 MachinePointerInfo(), EltAlign, 1829 MachineMemOperand::MOLoad); 1830 1831 for (unsigned j = 0; j < NumElts; ++j) { 1832 ProxyRegOps.push_back(RetVal.getValue(j)); 1833 1834 if (needTruncate) 1835 ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT)); 1836 else 1837 ProxyRegTruncates.push_back(Optional<MVT>()); 1838 } 1839 1840 Chain = RetVal.getValue(NumElts); 1841 InFlag = RetVal.getValue(NumElts + 1); 1842 1843 // Cleanup 1844 VecIdx = -1; 1845 LoadVTs.clear(); 1846 } 1847 } 1848 } 1849 1850 Chain = DAG.getCALLSEQ_END( 1851 Chain, DAG.getIntPtrConstant(UniqueCallSite, dl, true), 1852 DAG.getIntPtrConstant(UniqueCallSite + 1, dl, true), InFlag, dl); 1853 InFlag = Chain.getValue(1); 1854 1855 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 1856 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 1857 // dangling. 1858 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 1859 SDValue Ret = DAG.getNode( 1860 NVPTXISD::ProxyReg, dl, 1861 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 1862 { Chain, ProxyRegOps[i], InFlag } 1863 ); 1864 1865 Chain = Ret.getValue(1); 1866 InFlag = Ret.getValue(2); 1867 1868 if (ProxyRegTruncates[i].hasValue()) { 1869 Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret); 1870 } 1871 1872 InVals.push_back(Ret); 1873 } 1874 1875 // set isTailCall to false for now, until we figure out how to express 1876 // tail call optimization in PTX 1877 isTailCall = false; 1878 return Chain; 1879 } 1880 1881 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 1882 // (see LegalizeDAG.cpp). This is slow and uses local memory. 1883 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 1884 SDValue 1885 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 1886 SDNode *Node = Op.getNode(); 1887 SDLoc dl(Node); 1888 SmallVector<SDValue, 8> Ops; 1889 unsigned NumOperands = Node->getNumOperands(); 1890 for (unsigned i = 0; i < NumOperands; ++i) { 1891 SDValue SubOp = Node->getOperand(i); 1892 EVT VVT = SubOp.getNode()->getValueType(0); 1893 EVT EltVT = VVT.getVectorElementType(); 1894 unsigned NumSubElem = VVT.getVectorNumElements(); 1895 for (unsigned j = 0; j < NumSubElem; ++j) { 1896 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 1897 DAG.getIntPtrConstant(j, dl))); 1898 } 1899 } 1900 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 1901 } 1902 1903 // We can init constant f16x2 with a single .b32 move. Normally it 1904 // would get lowered as two constant loads and vector-packing move. 
1905 // mov.b16 %h1, 0x4000; 1906 // mov.b16 %h2, 0x3C00; 1907 // mov.b32 %hh2, {%h2, %h1}; 1908 // Instead we want just a constant move: 1909 // mov.b32 %hh2, 0x40003C00 1910 // 1911 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 1912 // generates good SASS in both cases. 1913 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 1914 SelectionDAG &DAG) const { 1915 //return Op; 1916 if (!(Op->getValueType(0) == MVT::v2f16 && 1917 isa<ConstantFPSDNode>(Op->getOperand(0)) && 1918 isa<ConstantFPSDNode>(Op->getOperand(1)))) 1919 return Op; 1920 1921 APInt E0 = 1922 cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); 1923 APInt E1 = 1924 cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); 1925 SDValue Const = 1926 DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); 1927 return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const); 1928 } 1929 1930 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 1931 SelectionDAG &DAG) const { 1932 SDValue Index = Op->getOperand(1); 1933 // Constant index will be matched by tablegen. 1934 if (isa<ConstantSDNode>(Index.getNode())) 1935 return Op; 1936 1937 // Extract individual elements and select one of them. 1938 SDValue Vector = Op->getOperand(0); 1939 EVT VectorVT = Vector.getValueType(); 1940 assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); 1941 EVT EltVT = VectorVT.getVectorElementType(); 1942 1943 SDLoc dl(Op.getNode()); 1944 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 1945 DAG.getIntPtrConstant(0, dl)); 1946 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 1947 DAG.getIntPtrConstant(1, dl)); 1948 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 1949 ISD::CondCode::SETEQ); 1950 } 1951 1952 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 1953 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 1954 /// amount, or 1955 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 1956 /// amount. 1957 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 1958 SelectionDAG &DAG) const { 1959 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 1960 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 1961 1962 EVT VT = Op.getValueType(); 1963 unsigned VTBits = VT.getSizeInBits(); 1964 SDLoc dl(Op); 1965 SDValue ShOpLo = Op.getOperand(0); 1966 SDValue ShOpHi = Op.getOperand(1); 1967 SDValue ShAmt = Op.getOperand(2); 1968 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 1969 1970 if (VTBits == 32 && STI.getSmVersion() >= 35) { 1971 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
1972 // {dHi, dLo} = {aHi, aLo} >> Amt 1973 // dHi = aHi >> Amt 1974 // dLo = shf.r.clamp aLo, aHi, Amt 1975 1976 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 1977 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 1978 ShAmt); 1979 1980 SDValue Ops[2] = { Lo, Hi }; 1981 return DAG.getMergeValues(Ops, dl); 1982 } 1983 else { 1984 // {dHi, dLo} = {aHi, aLo} >> Amt 1985 // - if (Amt>=size) then 1986 // dLo = aHi >> (Amt-size) 1987 // dHi = aHi >> Amt (this is either all 0 or all 1) 1988 // else 1989 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 1990 // dHi = aHi >> Amt 1991 1992 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 1993 DAG.getConstant(VTBits, dl, MVT::i32), 1994 ShAmt); 1995 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 1996 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 1997 DAG.getConstant(VTBits, dl, MVT::i32)); 1998 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 1999 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2000 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2001 2002 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2003 DAG.getConstant(VTBits, dl, MVT::i32), 2004 ISD::SETGE); 2005 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2006 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2007 2008 SDValue Ops[2] = { Lo, Hi }; 2009 return DAG.getMergeValues(Ops, dl); 2010 } 2011 } 2012 2013 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2014 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2015 /// amount, or 2016 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2017 /// amount. 2018 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2019 SelectionDAG &DAG) const { 2020 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2021 assert(Op.getOpcode() == ISD::SHL_PARTS); 2022 2023 EVT VT = Op.getValueType(); 2024 unsigned VTBits = VT.getSizeInBits(); 2025 SDLoc dl(Op); 2026 SDValue ShOpLo = Op.getOperand(0); 2027 SDValue ShOpHi = Op.getOperand(1); 2028 SDValue ShAmt = Op.getOperand(2); 2029 2030 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2031 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2032 // {dHi, dLo} = {aHi, aLo} << Amt 2033 // dHi = shf.l.clamp aLo, aHi, Amt 2034 // dLo = aLo << Amt 2035 2036 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, 2037 ShAmt); 2038 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2039 2040 SDValue Ops[2] = { Lo, Hi }; 2041 return DAG.getMergeValues(Ops, dl); 2042 } 2043 else { 2044 // {dHi, dLo} = {aHi, aLo} << Amt 2045 // - if (Amt>=size) then 2046 // dLo = aLo << Amt (all 0) 2047 // dLo = aLo << (Amt-size) 2048 // else 2049 // dLo = aLo << Amt 2050 // dHi = (aHi << Amt) | (aLo >> (size-Amt)) 2051 2052 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2053 DAG.getConstant(VTBits, dl, MVT::i32), 2054 ShAmt); 2055 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 2056 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2057 DAG.getConstant(VTBits, dl, MVT::i32)); 2058 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 2059 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2060 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 2061 2062 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2063 DAG.getConstant(VTBits, dl, MVT::i32), 2064 ISD::SETGE); 2065 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2066 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2067 2068 SDValue Ops[2] = { Lo, Hi }; 2069 return DAG.getMergeValues(Ops, dl); 2070 } 2071 } 2072 2073 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2074 EVT VT = Op.getValueType(); 2075 2076 if (VT == MVT::f32) 2077 return LowerFROUND32(Op, DAG); 2078 2079 if (VT == MVT::f64) 2080 return LowerFROUND64(Op, DAG); 2081 2082 llvm_unreachable("unhandled type"); 2083 } 2084 2085 // This is the the rounding method used in CUDA libdevice in C like code: 2086 // float roundf(float A) 2087 // { 2088 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)); 2089 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2090 // return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2091 // } 2092 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op, 2093 SelectionDAG &DAG) const { 2094 SDLoc SL(Op); 2095 SDValue A = Op.getOperand(0); 2096 EVT VT = Op.getValueType(); 2097 2098 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2099 2100 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)) 2101 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A); 2102 const int SignBitMask = 0x80000000; 2103 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast, 2104 DAG.getConstant(SignBitMask, SL, MVT::i32)); 2105 const int PointFiveInBits = 0x3F000000; 2106 SDValue PointFiveWithSignRaw = 2107 DAG.getNode(ISD::OR, SL, MVT::i32, Sign, 2108 DAG.getConstant(PointFiveInBits, SL, MVT::i32)); 2109 SDValue PointFiveWithSign = 2110 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw); 2111 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign); 2112 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2113 2114 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2115 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2116 SDValue IsLarge = 2117 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT), 2118 ISD::SETOGT); 2119 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2120 2121 // return abs(A) < 0.5 ? 
(float)(int)A : RoundedA; 2122 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2123 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2124 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2125 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2126 } 2127 2128 // The implementation of round(double) is similar to that of round(float) in 2129 // that they both separate the value range into three regions and use a method 2130 // specific to the region to round the values. However, round(double) first 2131 // calculates the round of the absolute value and then adds the sign back while 2132 // round(float) directly rounds the value with sign. 2133 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2134 SelectionDAG &DAG) const { 2135 SDLoc SL(Op); 2136 SDValue A = Op.getOperand(0); 2137 EVT VT = Op.getValueType(); 2138 2139 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2140 2141 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2142 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2143 DAG.getConstantFP(0.5, SL, VT)); 2144 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2145 2146 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2147 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2148 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2149 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2150 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2151 DAG.getConstantFP(0, SL, VT), 2152 RoundedA); 2153 2154 // Add sign to rounded_A 2155 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2156 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2157 2158 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2159 SDValue IsLarge = 2160 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2161 ISD::SETOGT); 2162 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2163 } 2164 2165 2166 2167 SDValue 2168 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2169 switch (Op.getOpcode()) { 2170 case ISD::RETURNADDR: 2171 return SDValue(); 2172 case ISD::FRAMEADDR: 2173 return SDValue(); 2174 case ISD::GlobalAddress: 2175 return LowerGlobalAddress(Op, DAG); 2176 case ISD::INTRINSIC_W_CHAIN: 2177 return Op; 2178 case ISD::BUILD_VECTOR: 2179 return LowerBUILD_VECTOR(Op, DAG); 2180 case ISD::EXTRACT_SUBVECTOR: 2181 return Op; 2182 case ISD::EXTRACT_VECTOR_ELT: 2183 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2184 case ISD::CONCAT_VECTORS: 2185 return LowerCONCAT_VECTORS(Op, DAG); 2186 case ISD::STORE: 2187 return LowerSTORE(Op, DAG); 2188 case ISD::LOAD: 2189 return LowerLOAD(Op, DAG); 2190 case ISD::SHL_PARTS: 2191 return LowerShiftLeftParts(Op, DAG); 2192 case ISD::SRA_PARTS: 2193 case ISD::SRL_PARTS: 2194 return LowerShiftRightParts(Op, DAG); 2195 case ISD::SELECT: 2196 return LowerSelect(Op, DAG); 2197 case ISD::FROUND: 2198 return LowerFROUND(Op, DAG); 2199 default: 2200 llvm_unreachable("Custom lowering not defined for operation"); 2201 } 2202 } 2203 2204 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2205 SDValue Op0 = Op->getOperand(0); 2206 SDValue Op1 = Op->getOperand(1); 2207 SDValue Op2 = Op->getOperand(2); 2208 SDLoc DL(Op.getNode()); 2209 2210 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2211 2212 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2213 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2214 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 
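  // The i1 operands were any-extended to i32 above so the select is performed
  // in a regular 32-bit register; truncate the selected value back to i1.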
2215 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2216 2217 return Trunc; 2218 } 2219 2220 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2221 if (Op.getValueType() == MVT::i1) 2222 return LowerLOADi1(Op, DAG); 2223 2224 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2225 // loads and have to handle it here. 2226 if (Op.getValueType() == MVT::v2f16) { 2227 LoadSDNode *Load = cast<LoadSDNode>(Op); 2228 EVT MemVT = Load->getMemoryVT(); 2229 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2230 MemVT, *Load->getMemOperand())) { 2231 SDValue Ops[2]; 2232 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2233 return DAG.getMergeValues(Ops, SDLoc(Op)); 2234 } 2235 } 2236 2237 return SDValue(); 2238 } 2239 2240 // v = ld i1* addr 2241 // => 2242 // v1 = ld i8* addr (-> i16) 2243 // v = trunc i16 to i1 2244 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2245 SDNode *Node = Op.getNode(); 2246 LoadSDNode *LD = cast<LoadSDNode>(Node); 2247 SDLoc dl(Node); 2248 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2249 assert(Node->getValueType(0) == MVT::i1 && 2250 "Custom lowering for i1 load only"); 2251 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), 2252 LD->getPointerInfo(), LD->getAlignment(), 2253 LD->getMemOperand()->getFlags()); 2254 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2255 // The legalizer (the caller) is expecting two values from the legalized 2256 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2257 // in LegalizeDAG.cpp which also uses MergeValues. 2258 SDValue Ops[] = { result, LD->getChain() }; 2259 return DAG.getMergeValues(Ops, dl); 2260 } 2261 2262 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2263 StoreSDNode *Store = cast<StoreSDNode>(Op); 2264 EVT VT = Store->getMemoryVT(); 2265 2266 if (VT == MVT::i1) 2267 return LowerSTOREi1(Op, DAG); 2268 2269 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2270 // stores and have to handle it here. 2271 if (VT == MVT::v2f16 && 2272 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2273 VT, *Store->getMemOperand())) 2274 return expandUnalignedStore(Store, DAG); 2275 2276 if (VT.isVector()) 2277 return LowerSTOREVector(Op, DAG); 2278 2279 return SDValue(); 2280 } 2281 2282 SDValue 2283 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2284 SDNode *N = Op.getNode(); 2285 SDValue Val = N->getOperand(1); 2286 SDLoc DL(N); 2287 EVT ValVT = Val.getValueType(); 2288 2289 if (ValVT.isVector()) { 2290 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2291 // legal. We can (and should) split that into 2 stores of <2 x double> here 2292 // but I'm leaving that as a TODO for now. 
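    // Only the "native" PTX vector shapes listed in the switch below are
    // handled here; anything else returns SDValue() and is left to the
    // default lowering.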
2293 if (!ValVT.isSimple()) 2294 return SDValue(); 2295 switch (ValVT.getSimpleVT().SimpleTy) { 2296 default: 2297 return SDValue(); 2298 case MVT::v2i8: 2299 case MVT::v2i16: 2300 case MVT::v2i32: 2301 case MVT::v2i64: 2302 case MVT::v2f16: 2303 case MVT::v2f32: 2304 case MVT::v2f64: 2305 case MVT::v4i8: 2306 case MVT::v4i16: 2307 case MVT::v4i32: 2308 case MVT::v4f16: 2309 case MVT::v4f32: 2310 case MVT::v8f16: // <4 x f16x2> 2311 // This is a "native" vector type 2312 break; 2313 } 2314 2315 MemSDNode *MemSD = cast<MemSDNode>(N); 2316 const DataLayout &TD = DAG.getDataLayout(); 2317 2318 Align Alignment = MemSD->getAlign(); 2319 Align PrefAlign = 2320 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); 2321 if (Alignment < PrefAlign) { 2322 // This store is not sufficiently aligned, so bail out and let this vector 2323 // store be scalarized. Note that we may still be able to emit smaller 2324 // vector stores. For example, if we are storing a <4 x float> with an 2325 // alignment of 8, this check will fail but the legalizer will try again 2326 // with 2 x <2 x float>, which will succeed with an alignment of 8. 2327 return SDValue(); 2328 } 2329 2330 unsigned Opcode = 0; 2331 EVT EltVT = ValVT.getVectorElementType(); 2332 unsigned NumElts = ValVT.getVectorNumElements(); 2333 2334 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2335 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 2336 // stored type to i16 and propagate the "real" type as the memory type. 2337 bool NeedExt = false; 2338 if (EltVT.getSizeInBits() < 16) 2339 NeedExt = true; 2340 2341 bool StoreF16x2 = false; 2342 switch (NumElts) { 2343 default: 2344 return SDValue(); 2345 case 2: 2346 Opcode = NVPTXISD::StoreV2; 2347 break; 2348 case 4: 2349 Opcode = NVPTXISD::StoreV4; 2350 break; 2351 case 8: 2352 // v8f16 is a special case. PTX doesn't have st.v8.f16 2353 // instruction. Instead, we split the vector into v2f16 chunks and 2354 // store them with st.v4.b32. 
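    // Illustrative PTX for the split (register names are made up):
    //   st.v4.b32 [%rd1], {%hh0, %hh1, %hh2, %hh3};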
2355 assert(EltVT == MVT::f16 && "Wrong type for the vector."); 2356 Opcode = NVPTXISD::StoreV4; 2357 StoreF16x2 = true; 2358 break; 2359 } 2360 2361 SmallVector<SDValue, 8> Ops; 2362 2363 // First is the chain 2364 Ops.push_back(N->getOperand(0)); 2365 2366 if (StoreF16x2) { 2367 // Combine f16,f16 -> v2f16 2368 NumElts /= 2; 2369 for (unsigned i = 0; i < NumElts; ++i) { 2370 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2371 DAG.getIntPtrConstant(i * 2, DL)); 2372 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2373 DAG.getIntPtrConstant(i * 2 + 1, DL)); 2374 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); 2375 Ops.push_back(V2); 2376 } 2377 } else { 2378 // Then the split values 2379 for (unsigned i = 0; i < NumElts; ++i) { 2380 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2381 DAG.getIntPtrConstant(i, DL)); 2382 if (NeedExt) 2383 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 2384 Ops.push_back(ExtVal); 2385 } 2386 } 2387 2388 // Then any remaining arguments 2389 Ops.append(N->op_begin() + 2, N->op_end()); 2390 2391 SDValue NewSt = 2392 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 2393 MemSD->getMemoryVT(), MemSD->getMemOperand()); 2394 2395 // return DCI.CombineTo(N, NewSt, true); 2396 return NewSt; 2397 } 2398 2399 return SDValue(); 2400 } 2401 2402 // st i1 v, addr 2403 // => 2404 // v1 = zxt v to i16 2405 // st.u8 i16, addr 2406 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 2407 SDNode *Node = Op.getNode(); 2408 SDLoc dl(Node); 2409 StoreSDNode *ST = cast<StoreSDNode>(Node); 2410 SDValue Tmp1 = ST->getChain(); 2411 SDValue Tmp2 = ST->getBasePtr(); 2412 SDValue Tmp3 = ST->getValue(); 2413 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 2414 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 2415 SDValue Result = 2416 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 2417 ST->getAlignment(), ST->getMemOperand()->getFlags()); 2418 return Result; 2419 } 2420 2421 SDValue 2422 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { 2423 std::string ParamSym; 2424 raw_string_ostream ParamStr(ParamSym); 2425 2426 ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx; 2427 ParamStr.flush(); 2428 2429 std::string *SavedStr = 2430 nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str()); 2431 return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); 2432 } 2433 2434 // Check to see if the kernel argument is image*_t or sampler_t 2435 2436 static bool isImageOrSamplerVal(const Value *arg, const Module *context) { 2437 static const char *const specialTypes[] = { "struct._image2d_t", 2438 "struct._image3d_t", 2439 "struct._sampler_t" }; 2440 2441 Type *Ty = arg->getType(); 2442 auto *PTy = dyn_cast<PointerType>(Ty); 2443 2444 if (!PTy) 2445 return false; 2446 2447 if (!context) 2448 return false; 2449 2450 auto *STy = dyn_cast<StructType>(PTy->getPointerElementType()); 2451 if (!STy || STy->isLiteral()) 2452 return false; 2453 2454 return llvm::is_contained(specialTypes, STy->getName()); 2455 } 2456 2457 SDValue NVPTXTargetLowering::LowerFormalArguments( 2458 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2459 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2460 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2461 MachineFunction &MF = DAG.getMachineFunction(); 2462 const DataLayout &DL = 
DAG.getDataLayout(); 2463 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2464 2465 const Function *F = &MF.getFunction(); 2466 const AttributeList &PAL = F->getAttributes(); 2467 const TargetLowering *TLI = STI.getTargetLowering(); 2468 2469 SDValue Root = DAG.getRoot(); 2470 std::vector<SDValue> OutChains; 2471 2472 bool isABI = (STI.getSmVersion() >= 20); 2473 assert(isABI && "Non-ABI compilation is not supported"); 2474 if (!isABI) 2475 return Chain; 2476 2477 std::vector<Type *> argTypes; 2478 std::vector<const Argument *> theArgs; 2479 for (const Argument &I : F->args()) { 2480 theArgs.push_back(&I); 2481 argTypes.push_back(I.getType()); 2482 } 2483 // argTypes.size() (or theArgs.size()) and Ins.size() need not match. 2484 // Ins.size() will be larger 2485 // * if there is an aggregate argument with multiple fields (each field 2486 // showing up separately in Ins) 2487 // * if there is a vector argument with more than typical vector-length 2488 // elements (generally if more than 4) where each vector element is 2489 // individually present in Ins. 2490 // So a different index should be used for indexing into Ins. 2491 // See similar issue in LowerCall. 2492 unsigned InsIdx = 0; 2493 2494 int idx = 0; 2495 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { 2496 Type *Ty = argTypes[i]; 2497 2498 // If the kernel argument is image*_t or sampler_t, convert it to 2499 // a i32 constant holding the parameter position. This can later 2500 // matched in the AsmPrinter to output the correct mangled name. 2501 if (isImageOrSamplerVal( 2502 theArgs[i], 2503 (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent() 2504 : nullptr))) { 2505 assert(isKernelFunction(*F) && 2506 "Only kernels can have image/sampler params"); 2507 InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); 2508 continue; 2509 } 2510 2511 if (theArgs[i]->use_empty()) { 2512 // argument is dead 2513 if (Ty->isAggregateType() || Ty->isIntegerTy(128)) { 2514 SmallVector<EVT, 16> vtparts; 2515 2516 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); 2517 assert(vtparts.size() > 0 && "empty aggregate type not expected"); 2518 for (unsigned parti = 0, parte = vtparts.size(); parti != parte; 2519 ++parti) { 2520 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2521 ++InsIdx; 2522 } 2523 if (vtparts.size() > 0) 2524 --InsIdx; 2525 continue; 2526 } 2527 if (Ty->isVectorTy()) { 2528 EVT ObjectVT = getValueType(DL, Ty); 2529 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); 2530 for (unsigned parti = 0; parti < NumRegs; ++parti) { 2531 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2532 ++InsIdx; 2533 } 2534 if (NumRegs > 0) 2535 --InsIdx; 2536 continue; 2537 } 2538 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2539 continue; 2540 } 2541 2542 // In the following cases, assign a node order of "idx+1" 2543 // to newly created nodes. The SDNodes for params have to 2544 // appear in the same order as their order of appearance 2545 // in the original function. "idx+1" holds that order. 
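    // Non-byval arguments are read from their .param symbol (getParamSymbol
    // produces names of the form "<function>_param_<idx>"); byval arguments
    // are handled separately after this block.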
2546 if (!PAL.hasParamAttr(i, Attribute::ByVal)) { 2547 bool aggregateIsPacked = false; 2548 if (StructType *STy = dyn_cast<StructType>(Ty)) 2549 aggregateIsPacked = STy->isPacked(); 2550 2551 SmallVector<EVT, 16> VTs; 2552 SmallVector<uint64_t, 16> Offsets; 2553 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 2554 assert(VTs.size() > 0 && "Unexpected empty type."); 2555 auto VectorInfo = 2556 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); 2557 2558 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2559 int VecIdx = -1; // Index of the first element of the current vector. 2560 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 2561 if (VectorInfo[parti] & PVF_FIRST) { 2562 assert(VecIdx == -1 && "Orphaned vector."); 2563 VecIdx = parti; 2564 } 2565 2566 // That's the last element of this store op. 2567 if (VectorInfo[parti] & PVF_LAST) { 2568 unsigned NumElts = parti - VecIdx + 1; 2569 EVT EltVT = VTs[parti]; 2570 // i1 is loaded/stored as i8. 2571 EVT LoadVT = EltVT; 2572 if (EltVT == MVT::i1) 2573 LoadVT = MVT::i8; 2574 else if (EltVT == MVT::v2f16) 2575 // getLoad needs a vector type, but it can't handle 2576 // vectors which contain v2f16 elements. So we must load 2577 // using i32 here and then bitcast back. 2578 LoadVT = MVT::i32; 2579 2580 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 2581 SDValue VecAddr = 2582 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 2583 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 2584 Value *srcValue = Constant::getNullValue(PointerType::get( 2585 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 2586 SDValue P = 2587 DAG.getLoad(VecVT, dl, Root, VecAddr, 2588 MachinePointerInfo(srcValue), aggregateIsPacked, 2589 MachineMemOperand::MODereferenceable | 2590 MachineMemOperand::MOInvariant); 2591 if (P.getNode()) 2592 P.getNode()->setIROrder(idx + 1); 2593 for (unsigned j = 0; j < NumElts; ++j) { 2594 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 2595 DAG.getIntPtrConstant(j, dl)); 2596 // We've loaded i1 as an i8 and now must truncate it back to i1 2597 if (EltVT == MVT::i1) 2598 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 2599 // v2f16 was loaded as an i32. Now we must bitcast it back. 2600 else if (EltVT == MVT::v2f16) 2601 Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); 2602 // Extend the element if necessary (e.g. an i8 is loaded 2603 // into an i16 register) 2604 if (Ins[InsIdx].VT.isInteger() && 2605 Ins[InsIdx].VT.getFixedSizeInBits() > 2606 LoadVT.getFixedSizeInBits()) { 2607 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 2608 : ISD::ZERO_EXTEND; 2609 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 2610 } 2611 InVals.push_back(Elt); 2612 } 2613 2614 // Reset vector tracking state. 2615 VecIdx = -1; 2616 } 2617 ++InsIdx; 2618 } 2619 if (VTs.size() > 0) 2620 --InsIdx; 2621 continue; 2622 } 2623 2624 // Param has ByVal attribute 2625 // Return MoveParam(param symbol). 2626 // Ideally, the param symbol can be returned directly, 2627 // but when SDNode builder decides to use it in a CopyToReg(), 2628 // machine instruction fails because TargetExternalSymbol 2629 // (not lowered) is target dependent, and CopyToReg assumes 2630 // the source is lowered. 
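    // In effect the byval data already lives in the parameter space, so the
    // value produced here is the parameter's address wrapped in MoveParam
    // rather than a copy of its contents.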
2631 EVT ObjectVT = getValueType(DL, Ty); 2632 assert(ObjectVT == Ins[InsIdx].VT && 2633 "Ins type did not match function type"); 2634 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2635 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 2636 if (p.getNode()) 2637 p.getNode()->setIROrder(idx + 1); 2638 InVals.push_back(p); 2639 } 2640 2641 // Clang will check explicit VarArg and issue error if any. However, Clang 2642 // will let code with 2643 // implicit var arg like f() pass. See bug 617733. 2644 // We treat this case as if the arg list is empty. 2645 // if (F.isVarArg()) { 2646 // assert(0 && "VarArg not supported yet!"); 2647 //} 2648 2649 if (!OutChains.empty()) 2650 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 2651 2652 return Chain; 2653 } 2654 2655 SDValue 2656 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2657 bool isVarArg, 2658 const SmallVectorImpl<ISD::OutputArg> &Outs, 2659 const SmallVectorImpl<SDValue> &OutVals, 2660 const SDLoc &dl, SelectionDAG &DAG) const { 2661 MachineFunction &MF = DAG.getMachineFunction(); 2662 Type *RetTy = MF.getFunction().getReturnType(); 2663 2664 bool isABI = (STI.getSmVersion() >= 20); 2665 assert(isABI && "Non-ABI compilation is not supported"); 2666 if (!isABI) 2667 return Chain; 2668 2669 const DataLayout &DL = DAG.getDataLayout(); 2670 SmallVector<EVT, 16> VTs; 2671 SmallVector<uint64_t, 16> Offsets; 2672 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 2673 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 2674 2675 auto VectorInfo = VectorizePTXValueVTs( 2676 VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1)); 2677 2678 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2679 // 32-bits are sign extended or zero extended, depending on whether 2680 // they are signed or unsigned types. 2681 bool ExtendIntegerRetVal = 2682 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2683 2684 SmallVector<SDValue, 6> StoreOperands; 2685 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2686 // New load/store. Record chain and offset operands. 2687 if (VectorInfo[i] & PVF_FIRST) { 2688 assert(StoreOperands.empty() && "Orphaned operand list."); 2689 StoreOperands.push_back(Chain); 2690 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 2691 } 2692 2693 SDValue RetVal = OutVals[i]; 2694 if (ExtendIntegerRetVal) { 2695 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 2696 : ISD::ZERO_EXTEND, 2697 dl, MVT::i32, RetVal); 2698 } else if (RetVal.getValueSizeInBits() < 16) { 2699 // Use 16-bit registers for small load-stores as it's the 2700 // smallest general purpose register size supported by NVPTX. 2701 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 2702 } 2703 2704 // Record the value to return. 2705 StoreOperands.push_back(RetVal); 2706 2707 // That's the last element of this store op. 2708 if (VectorInfo[i] & PVF_LAST) { 2709 NVPTXISD::NodeType Op; 2710 unsigned NumElts = StoreOperands.size() - 2; 2711 switch (NumElts) { 2712 case 1: 2713 Op = NVPTXISD::StoreRetval; 2714 break; 2715 case 2: 2716 Op = NVPTXISD::StoreRetvalV2; 2717 break; 2718 case 4: 2719 Op = NVPTXISD::StoreRetvalV4; 2720 break; 2721 default: 2722 llvm_unreachable("Invalid vector info."); 2723 } 2724 2725 // Adjust type of load/store op if we've extended the scalar 2726 // return value. 2727 EVT TheStoreType = ExtendIntegerRetVal ? 
MVT::i32 : VTs[i]; 2728 Chain = DAG.getMemIntrinsicNode( 2729 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, 2730 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 2731 // Cleanup vector state. 2732 StoreOperands.clear(); 2733 } 2734 } 2735 2736 return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); 2737 } 2738 2739 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 2740 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 2741 SelectionDAG &DAG) const { 2742 if (Constraint.length() > 1) 2743 return; 2744 else 2745 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 2746 } 2747 2748 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 2749 switch (Intrinsic) { 2750 default: 2751 return 0; 2752 2753 case Intrinsic::nvvm_tex_1d_v4f32_s32: 2754 return NVPTXISD::Tex1DFloatS32; 2755 case Intrinsic::nvvm_tex_1d_v4f32_f32: 2756 return NVPTXISD::Tex1DFloatFloat; 2757 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 2758 return NVPTXISD::Tex1DFloatFloatLevel; 2759 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 2760 return NVPTXISD::Tex1DFloatFloatGrad; 2761 case Intrinsic::nvvm_tex_1d_v4s32_s32: 2762 return NVPTXISD::Tex1DS32S32; 2763 case Intrinsic::nvvm_tex_1d_v4s32_f32: 2764 return NVPTXISD::Tex1DS32Float; 2765 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 2766 return NVPTXISD::Tex1DS32FloatLevel; 2767 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 2768 return NVPTXISD::Tex1DS32FloatGrad; 2769 case Intrinsic::nvvm_tex_1d_v4u32_s32: 2770 return NVPTXISD::Tex1DU32S32; 2771 case Intrinsic::nvvm_tex_1d_v4u32_f32: 2772 return NVPTXISD::Tex1DU32Float; 2773 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 2774 return NVPTXISD::Tex1DU32FloatLevel; 2775 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 2776 return NVPTXISD::Tex1DU32FloatGrad; 2777 2778 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 2779 return NVPTXISD::Tex1DArrayFloatS32; 2780 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 2781 return NVPTXISD::Tex1DArrayFloatFloat; 2782 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 2783 return NVPTXISD::Tex1DArrayFloatFloatLevel; 2784 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 2785 return NVPTXISD::Tex1DArrayFloatFloatGrad; 2786 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 2787 return NVPTXISD::Tex1DArrayS32S32; 2788 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 2789 return NVPTXISD::Tex1DArrayS32Float; 2790 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 2791 return NVPTXISD::Tex1DArrayS32FloatLevel; 2792 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 2793 return NVPTXISD::Tex1DArrayS32FloatGrad; 2794 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 2795 return NVPTXISD::Tex1DArrayU32S32; 2796 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 2797 return NVPTXISD::Tex1DArrayU32Float; 2798 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 2799 return NVPTXISD::Tex1DArrayU32FloatLevel; 2800 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 2801 return NVPTXISD::Tex1DArrayU32FloatGrad; 2802 2803 case Intrinsic::nvvm_tex_2d_v4f32_s32: 2804 return NVPTXISD::Tex2DFloatS32; 2805 case Intrinsic::nvvm_tex_2d_v4f32_f32: 2806 return NVPTXISD::Tex2DFloatFloat; 2807 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 2808 return NVPTXISD::Tex2DFloatFloatLevel; 2809 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 2810 return NVPTXISD::Tex2DFloatFloatGrad; 2811 case Intrinsic::nvvm_tex_2d_v4s32_s32: 2812 return NVPTXISD::Tex2DS32S32; 2813 case Intrinsic::nvvm_tex_2d_v4s32_f32: 2814 return NVPTXISD::Tex2DS32Float; 2815 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 
2816 return NVPTXISD::Tex2DS32FloatLevel; 2817 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 2818 return NVPTXISD::Tex2DS32FloatGrad; 2819 case Intrinsic::nvvm_tex_2d_v4u32_s32: 2820 return NVPTXISD::Tex2DU32S32; 2821 case Intrinsic::nvvm_tex_2d_v4u32_f32: 2822 return NVPTXISD::Tex2DU32Float; 2823 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 2824 return NVPTXISD::Tex2DU32FloatLevel; 2825 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 2826 return NVPTXISD::Tex2DU32FloatGrad; 2827 2828 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 2829 return NVPTXISD::Tex2DArrayFloatS32; 2830 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 2831 return NVPTXISD::Tex2DArrayFloatFloat; 2832 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 2833 return NVPTXISD::Tex2DArrayFloatFloatLevel; 2834 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 2835 return NVPTXISD::Tex2DArrayFloatFloatGrad; 2836 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 2837 return NVPTXISD::Tex2DArrayS32S32; 2838 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 2839 return NVPTXISD::Tex2DArrayS32Float; 2840 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 2841 return NVPTXISD::Tex2DArrayS32FloatLevel; 2842 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 2843 return NVPTXISD::Tex2DArrayS32FloatGrad; 2844 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 2845 return NVPTXISD::Tex2DArrayU32S32; 2846 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 2847 return NVPTXISD::Tex2DArrayU32Float; 2848 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 2849 return NVPTXISD::Tex2DArrayU32FloatLevel; 2850 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 2851 return NVPTXISD::Tex2DArrayU32FloatGrad; 2852 2853 case Intrinsic::nvvm_tex_3d_v4f32_s32: 2854 return NVPTXISD::Tex3DFloatS32; 2855 case Intrinsic::nvvm_tex_3d_v4f32_f32: 2856 return NVPTXISD::Tex3DFloatFloat; 2857 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 2858 return NVPTXISD::Tex3DFloatFloatLevel; 2859 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 2860 return NVPTXISD::Tex3DFloatFloatGrad; 2861 case Intrinsic::nvvm_tex_3d_v4s32_s32: 2862 return NVPTXISD::Tex3DS32S32; 2863 case Intrinsic::nvvm_tex_3d_v4s32_f32: 2864 return NVPTXISD::Tex3DS32Float; 2865 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 2866 return NVPTXISD::Tex3DS32FloatLevel; 2867 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 2868 return NVPTXISD::Tex3DS32FloatGrad; 2869 case Intrinsic::nvvm_tex_3d_v4u32_s32: 2870 return NVPTXISD::Tex3DU32S32; 2871 case Intrinsic::nvvm_tex_3d_v4u32_f32: 2872 return NVPTXISD::Tex3DU32Float; 2873 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 2874 return NVPTXISD::Tex3DU32FloatLevel; 2875 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 2876 return NVPTXISD::Tex3DU32FloatGrad; 2877 2878 case Intrinsic::nvvm_tex_cube_v4f32_f32: 2879 return NVPTXISD::TexCubeFloatFloat; 2880 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 2881 return NVPTXISD::TexCubeFloatFloatLevel; 2882 case Intrinsic::nvvm_tex_cube_v4s32_f32: 2883 return NVPTXISD::TexCubeS32Float; 2884 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 2885 return NVPTXISD::TexCubeS32FloatLevel; 2886 case Intrinsic::nvvm_tex_cube_v4u32_f32: 2887 return NVPTXISD::TexCubeU32Float; 2888 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 2889 return NVPTXISD::TexCubeU32FloatLevel; 2890 2891 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 2892 return NVPTXISD::TexCubeArrayFloatFloat; 2893 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 2894 return NVPTXISD::TexCubeArrayFloatFloatLevel; 2895 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 2896 return NVPTXISD::TexCubeArrayS32Float; 2897 case 
Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 2898 return NVPTXISD::TexCubeArrayS32FloatLevel; 2899 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 2900 return NVPTXISD::TexCubeArrayU32Float; 2901 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 2902 return NVPTXISD::TexCubeArrayU32FloatLevel; 2903 2904 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 2905 return NVPTXISD::Tld4R2DFloatFloat; 2906 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 2907 return NVPTXISD::Tld4G2DFloatFloat; 2908 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 2909 return NVPTXISD::Tld4B2DFloatFloat; 2910 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 2911 return NVPTXISD::Tld4A2DFloatFloat; 2912 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 2913 return NVPTXISD::Tld4R2DS64Float; 2914 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 2915 return NVPTXISD::Tld4G2DS64Float; 2916 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 2917 return NVPTXISD::Tld4B2DS64Float; 2918 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 2919 return NVPTXISD::Tld4A2DS64Float; 2920 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 2921 return NVPTXISD::Tld4R2DU64Float; 2922 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 2923 return NVPTXISD::Tld4G2DU64Float; 2924 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 2925 return NVPTXISD::Tld4B2DU64Float; 2926 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 2927 return NVPTXISD::Tld4A2DU64Float; 2928 2929 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 2930 return NVPTXISD::TexUnified1DFloatS32; 2931 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 2932 return NVPTXISD::TexUnified1DFloatFloat; 2933 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 2934 return NVPTXISD::TexUnified1DFloatFloatLevel; 2935 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 2936 return NVPTXISD::TexUnified1DFloatFloatGrad; 2937 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 2938 return NVPTXISD::TexUnified1DS32S32; 2939 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 2940 return NVPTXISD::TexUnified1DS32Float; 2941 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 2942 return NVPTXISD::TexUnified1DS32FloatLevel; 2943 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 2944 return NVPTXISD::TexUnified1DS32FloatGrad; 2945 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 2946 return NVPTXISD::TexUnified1DU32S32; 2947 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 2948 return NVPTXISD::TexUnified1DU32Float; 2949 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 2950 return NVPTXISD::TexUnified1DU32FloatLevel; 2951 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 2952 return NVPTXISD::TexUnified1DU32FloatGrad; 2953 2954 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 2955 return NVPTXISD::TexUnified1DArrayFloatS32; 2956 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 2957 return NVPTXISD::TexUnified1DArrayFloatFloat; 2958 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 2959 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 2960 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 2961 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 2962 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 2963 return NVPTXISD::TexUnified1DArrayS32S32; 2964 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 2965 return NVPTXISD::TexUnified1DArrayS32Float; 2966 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 2967 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 2968 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 2969 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 2970 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 2971 return 
NVPTXISD::TexUnified1DArrayU32S32; 2972 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 2973 return NVPTXISD::TexUnified1DArrayU32Float; 2974 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 2975 return NVPTXISD::TexUnified1DArrayU32FloatLevel; 2976 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 2977 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 2978 2979 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 2980 return NVPTXISD::TexUnified2DFloatS32; 2981 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 2982 return NVPTXISD::TexUnified2DFloatFloat; 2983 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 2984 return NVPTXISD::TexUnified2DFloatFloatLevel; 2985 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 2986 return NVPTXISD::TexUnified2DFloatFloatGrad; 2987 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 2988 return NVPTXISD::TexUnified2DS32S32; 2989 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 2990 return NVPTXISD::TexUnified2DS32Float; 2991 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 2992 return NVPTXISD::TexUnified2DS32FloatLevel; 2993 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 2994 return NVPTXISD::TexUnified2DS32FloatGrad; 2995 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 2996 return NVPTXISD::TexUnified2DU32S32; 2997 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 2998 return NVPTXISD::TexUnified2DU32Float; 2999 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3000 return NVPTXISD::TexUnified2DU32FloatLevel; 3001 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3002 return NVPTXISD::TexUnified2DU32FloatGrad; 3003 3004 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3005 return NVPTXISD::TexUnified2DArrayFloatS32; 3006 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3007 return NVPTXISD::TexUnified2DArrayFloatFloat; 3008 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3009 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3010 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3011 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3012 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3013 return NVPTXISD::TexUnified2DArrayS32S32; 3014 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3015 return NVPTXISD::TexUnified2DArrayS32Float; 3016 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3017 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3018 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3019 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3020 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3021 return NVPTXISD::TexUnified2DArrayU32S32; 3022 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3023 return NVPTXISD::TexUnified2DArrayU32Float; 3024 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3025 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3026 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3027 return NVPTXISD::TexUnified2DArrayU32FloatGrad; 3028 3029 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3030 return NVPTXISD::TexUnified3DFloatS32; 3031 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3032 return NVPTXISD::TexUnified3DFloatFloat; 3033 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3034 return NVPTXISD::TexUnified3DFloatFloatLevel; 3035 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3036 return NVPTXISD::TexUnified3DFloatFloatGrad; 3037 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3038 return NVPTXISD::TexUnified3DS32S32; 3039 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3040 return NVPTXISD::TexUnified3DS32Float; 
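  // Reading aid for this switch (a descriptive note, not part of the original
  // mapping): each intrinsic encodes <geometry>_<result vector type>_<coord
  // type>, and the matching NVPTXISD node spells the same information as
  // <Geometry><ResultType><CoordType>; for example, just above,
  // llvm.nvvm.tex.unified.3d.v4s32.f32 maps to NVPTXISD::TexUnified3DS32Float.
  // The "unified" variants take a single texture handle (PTX unified
  // texturing mode), while the non-unified variants earlier in this switch
  // additionally take a separate sampler operand (independent mode).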
3041 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3042 return NVPTXISD::TexUnified3DS32FloatLevel; 3043 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3044 return NVPTXISD::TexUnified3DS32FloatGrad; 3045 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3046 return NVPTXISD::TexUnified3DU32S32; 3047 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3048 return NVPTXISD::TexUnified3DU32Float; 3049 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3050 return NVPTXISD::TexUnified3DU32FloatLevel; 3051 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3052 return NVPTXISD::TexUnified3DU32FloatGrad; 3053 3054 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3055 return NVPTXISD::TexUnifiedCubeFloatFloat; 3056 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3057 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3058 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3059 return NVPTXISD::TexUnifiedCubeS32Float; 3060 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3061 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3062 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3063 return NVPTXISD::TexUnifiedCubeU32Float; 3064 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3065 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3066 3067 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3068 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3069 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3070 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3071 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3072 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3073 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3074 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3075 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3076 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3077 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3078 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3079 3080 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3081 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3082 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3083 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3084 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3085 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3086 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3087 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3088 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3089 return NVPTXISD::Tld4UnifiedR2DS64Float; 3090 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3091 return NVPTXISD::Tld4UnifiedG2DS64Float; 3092 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3093 return NVPTXISD::Tld4UnifiedB2DS64Float; 3094 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3095 return NVPTXISD::Tld4UnifiedA2DS64Float; 3096 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3097 return NVPTXISD::Tld4UnifiedR2DU64Float; 3098 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3099 return NVPTXISD::Tld4UnifiedG2DU64Float; 3100 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3101 return NVPTXISD::Tld4UnifiedB2DU64Float; 3102 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3103 return NVPTXISD::Tld4UnifiedA2DU64Float; 3104 } 3105 } 3106 3107 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3108 switch (Intrinsic) { 3109 default: 3110 return 0; 3111 case Intrinsic::nvvm_suld_1d_i8_clamp: 3112 return NVPTXISD::Suld1DI8Clamp; 3113 case Intrinsic::nvvm_suld_1d_i16_clamp: 3114 return NVPTXISD::Suld1DI16Clamp; 3115 case Intrinsic::nvvm_suld_1d_i32_clamp: 3116 return 
NVPTXISD::Suld1DI32Clamp; 3117 case Intrinsic::nvvm_suld_1d_i64_clamp: 3118 return NVPTXISD::Suld1DI64Clamp; 3119 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3120 return NVPTXISD::Suld1DV2I8Clamp; 3121 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3122 return NVPTXISD::Suld1DV2I16Clamp; 3123 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3124 return NVPTXISD::Suld1DV2I32Clamp; 3125 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3126 return NVPTXISD::Suld1DV2I64Clamp; 3127 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3128 return NVPTXISD::Suld1DV4I8Clamp; 3129 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3130 return NVPTXISD::Suld1DV4I16Clamp; 3131 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3132 return NVPTXISD::Suld1DV4I32Clamp; 3133 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3134 return NVPTXISD::Suld1DArrayI8Clamp; 3135 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3136 return NVPTXISD::Suld1DArrayI16Clamp; 3137 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3138 return NVPTXISD::Suld1DArrayI32Clamp; 3139 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3140 return NVPTXISD::Suld1DArrayI64Clamp; 3141 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3142 return NVPTXISD::Suld1DArrayV2I8Clamp; 3143 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3144 return NVPTXISD::Suld1DArrayV2I16Clamp; 3145 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3146 return NVPTXISD::Suld1DArrayV2I32Clamp; 3147 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3148 return NVPTXISD::Suld1DArrayV2I64Clamp; 3149 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3150 return NVPTXISD::Suld1DArrayV4I8Clamp; 3151 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3152 return NVPTXISD::Suld1DArrayV4I16Clamp; 3153 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3154 return NVPTXISD::Suld1DArrayV4I32Clamp; 3155 case Intrinsic::nvvm_suld_2d_i8_clamp: 3156 return NVPTXISD::Suld2DI8Clamp; 3157 case Intrinsic::nvvm_suld_2d_i16_clamp: 3158 return NVPTXISD::Suld2DI16Clamp; 3159 case Intrinsic::nvvm_suld_2d_i32_clamp: 3160 return NVPTXISD::Suld2DI32Clamp; 3161 case Intrinsic::nvvm_suld_2d_i64_clamp: 3162 return NVPTXISD::Suld2DI64Clamp; 3163 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3164 return NVPTXISD::Suld2DV2I8Clamp; 3165 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3166 return NVPTXISD::Suld2DV2I16Clamp; 3167 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3168 return NVPTXISD::Suld2DV2I32Clamp; 3169 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3170 return NVPTXISD::Suld2DV2I64Clamp; 3171 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3172 return NVPTXISD::Suld2DV4I8Clamp; 3173 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3174 return NVPTXISD::Suld2DV4I16Clamp; 3175 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3176 return NVPTXISD::Suld2DV4I32Clamp; 3177 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3178 return NVPTXISD::Suld2DArrayI8Clamp; 3179 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3180 return NVPTXISD::Suld2DArrayI16Clamp; 3181 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 3182 return NVPTXISD::Suld2DArrayI32Clamp; 3183 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3184 return NVPTXISD::Suld2DArrayI64Clamp; 3185 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3186 return NVPTXISD::Suld2DArrayV2I8Clamp; 3187 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3188 return NVPTXISD::Suld2DArrayV2I16Clamp; 3189 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3190 return NVPTXISD::Suld2DArrayV2I32Clamp; 3191 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3192 return NVPTXISD::Suld2DArrayV2I64Clamp; 3193 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3194 return 
NVPTXISD::Suld2DArrayV4I8Clamp; 3195 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3196 return NVPTXISD::Suld2DArrayV4I16Clamp; 3197 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3198 return NVPTXISD::Suld2DArrayV4I32Clamp; 3199 case Intrinsic::nvvm_suld_3d_i8_clamp: 3200 return NVPTXISD::Suld3DI8Clamp; 3201 case Intrinsic::nvvm_suld_3d_i16_clamp: 3202 return NVPTXISD::Suld3DI16Clamp; 3203 case Intrinsic::nvvm_suld_3d_i32_clamp: 3204 return NVPTXISD::Suld3DI32Clamp; 3205 case Intrinsic::nvvm_suld_3d_i64_clamp: 3206 return NVPTXISD::Suld3DI64Clamp; 3207 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3208 return NVPTXISD::Suld3DV2I8Clamp; 3209 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3210 return NVPTXISD::Suld3DV2I16Clamp; 3211 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3212 return NVPTXISD::Suld3DV2I32Clamp; 3213 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3214 return NVPTXISD::Suld3DV2I64Clamp; 3215 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 3216 return NVPTXISD::Suld3DV4I8Clamp; 3217 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 3218 return NVPTXISD::Suld3DV4I16Clamp; 3219 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 3220 return NVPTXISD::Suld3DV4I32Clamp; 3221 case Intrinsic::nvvm_suld_1d_i8_trap: 3222 return NVPTXISD::Suld1DI8Trap; 3223 case Intrinsic::nvvm_suld_1d_i16_trap: 3224 return NVPTXISD::Suld1DI16Trap; 3225 case Intrinsic::nvvm_suld_1d_i32_trap: 3226 return NVPTXISD::Suld1DI32Trap; 3227 case Intrinsic::nvvm_suld_1d_i64_trap: 3228 return NVPTXISD::Suld1DI64Trap; 3229 case Intrinsic::nvvm_suld_1d_v2i8_trap: 3230 return NVPTXISD::Suld1DV2I8Trap; 3231 case Intrinsic::nvvm_suld_1d_v2i16_trap: 3232 return NVPTXISD::Suld1DV2I16Trap; 3233 case Intrinsic::nvvm_suld_1d_v2i32_trap: 3234 return NVPTXISD::Suld1DV2I32Trap; 3235 case Intrinsic::nvvm_suld_1d_v2i64_trap: 3236 return NVPTXISD::Suld1DV2I64Trap; 3237 case Intrinsic::nvvm_suld_1d_v4i8_trap: 3238 return NVPTXISD::Suld1DV4I8Trap; 3239 case Intrinsic::nvvm_suld_1d_v4i16_trap: 3240 return NVPTXISD::Suld1DV4I16Trap; 3241 case Intrinsic::nvvm_suld_1d_v4i32_trap: 3242 return NVPTXISD::Suld1DV4I32Trap; 3243 case Intrinsic::nvvm_suld_1d_array_i8_trap: 3244 return NVPTXISD::Suld1DArrayI8Trap; 3245 case Intrinsic::nvvm_suld_1d_array_i16_trap: 3246 return NVPTXISD::Suld1DArrayI16Trap; 3247 case Intrinsic::nvvm_suld_1d_array_i32_trap: 3248 return NVPTXISD::Suld1DArrayI32Trap; 3249 case Intrinsic::nvvm_suld_1d_array_i64_trap: 3250 return NVPTXISD::Suld1DArrayI64Trap; 3251 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 3252 return NVPTXISD::Suld1DArrayV2I8Trap; 3253 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 3254 return NVPTXISD::Suld1DArrayV2I16Trap; 3255 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 3256 return NVPTXISD::Suld1DArrayV2I32Trap; 3257 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 3258 return NVPTXISD::Suld1DArrayV2I64Trap; 3259 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 3260 return NVPTXISD::Suld1DArrayV4I8Trap; 3261 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 3262 return NVPTXISD::Suld1DArrayV4I16Trap; 3263 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 3264 return NVPTXISD::Suld1DArrayV4I32Trap; 3265 case Intrinsic::nvvm_suld_2d_i8_trap: 3266 return NVPTXISD::Suld2DI8Trap; 3267 case Intrinsic::nvvm_suld_2d_i16_trap: 3268 return NVPTXISD::Suld2DI16Trap; 3269 case Intrinsic::nvvm_suld_2d_i32_trap: 3270 return NVPTXISD::Suld2DI32Trap; 3271 case Intrinsic::nvvm_suld_2d_i64_trap: 3272 return NVPTXISD::Suld2DI64Trap; 3273 case Intrinsic::nvvm_suld_2d_v2i8_trap: 3274 return NVPTXISD::Suld2DV2I8Trap; 3275 case 
Intrinsic::nvvm_suld_2d_v2i16_trap: 3276 return NVPTXISD::Suld2DV2I16Trap; 3277 case Intrinsic::nvvm_suld_2d_v2i32_trap: 3278 return NVPTXISD::Suld2DV2I32Trap; 3279 case Intrinsic::nvvm_suld_2d_v2i64_trap: 3280 return NVPTXISD::Suld2DV2I64Trap; 3281 case Intrinsic::nvvm_suld_2d_v4i8_trap: 3282 return NVPTXISD::Suld2DV4I8Trap; 3283 case Intrinsic::nvvm_suld_2d_v4i16_trap: 3284 return NVPTXISD::Suld2DV4I16Trap; 3285 case Intrinsic::nvvm_suld_2d_v4i32_trap: 3286 return NVPTXISD::Suld2DV4I32Trap; 3287 case Intrinsic::nvvm_suld_2d_array_i8_trap: 3288 return NVPTXISD::Suld2DArrayI8Trap; 3289 case Intrinsic::nvvm_suld_2d_array_i16_trap: 3290 return NVPTXISD::Suld2DArrayI16Trap; 3291 case Intrinsic::nvvm_suld_2d_array_i32_trap: 3292 return NVPTXISD::Suld2DArrayI32Trap; 3293 case Intrinsic::nvvm_suld_2d_array_i64_trap: 3294 return NVPTXISD::Suld2DArrayI64Trap; 3295 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 3296 return NVPTXISD::Suld2DArrayV2I8Trap; 3297 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 3298 return NVPTXISD::Suld2DArrayV2I16Trap; 3299 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 3300 return NVPTXISD::Suld2DArrayV2I32Trap; 3301 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 3302 return NVPTXISD::Suld2DArrayV2I64Trap; 3303 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 3304 return NVPTXISD::Suld2DArrayV4I8Trap; 3305 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 3306 return NVPTXISD::Suld2DArrayV4I16Trap; 3307 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 3308 return NVPTXISD::Suld2DArrayV4I32Trap; 3309 case Intrinsic::nvvm_suld_3d_i8_trap: 3310 return NVPTXISD::Suld3DI8Trap; 3311 case Intrinsic::nvvm_suld_3d_i16_trap: 3312 return NVPTXISD::Suld3DI16Trap; 3313 case Intrinsic::nvvm_suld_3d_i32_trap: 3314 return NVPTXISD::Suld3DI32Trap; 3315 case Intrinsic::nvvm_suld_3d_i64_trap: 3316 return NVPTXISD::Suld3DI64Trap; 3317 case Intrinsic::nvvm_suld_3d_v2i8_trap: 3318 return NVPTXISD::Suld3DV2I8Trap; 3319 case Intrinsic::nvvm_suld_3d_v2i16_trap: 3320 return NVPTXISD::Suld3DV2I16Trap; 3321 case Intrinsic::nvvm_suld_3d_v2i32_trap: 3322 return NVPTXISD::Suld3DV2I32Trap; 3323 case Intrinsic::nvvm_suld_3d_v2i64_trap: 3324 return NVPTXISD::Suld3DV2I64Trap; 3325 case Intrinsic::nvvm_suld_3d_v4i8_trap: 3326 return NVPTXISD::Suld3DV4I8Trap; 3327 case Intrinsic::nvvm_suld_3d_v4i16_trap: 3328 return NVPTXISD::Suld3DV4I16Trap; 3329 case Intrinsic::nvvm_suld_3d_v4i32_trap: 3330 return NVPTXISD::Suld3DV4I32Trap; 3331 case Intrinsic::nvvm_suld_1d_i8_zero: 3332 return NVPTXISD::Suld1DI8Zero; 3333 case Intrinsic::nvvm_suld_1d_i16_zero: 3334 return NVPTXISD::Suld1DI16Zero; 3335 case Intrinsic::nvvm_suld_1d_i32_zero: 3336 return NVPTXISD::Suld1DI32Zero; 3337 case Intrinsic::nvvm_suld_1d_i64_zero: 3338 return NVPTXISD::Suld1DI64Zero; 3339 case Intrinsic::nvvm_suld_1d_v2i8_zero: 3340 return NVPTXISD::Suld1DV2I8Zero; 3341 case Intrinsic::nvvm_suld_1d_v2i16_zero: 3342 return NVPTXISD::Suld1DV2I16Zero; 3343 case Intrinsic::nvvm_suld_1d_v2i32_zero: 3344 return NVPTXISD::Suld1DV2I32Zero; 3345 case Intrinsic::nvvm_suld_1d_v2i64_zero: 3346 return NVPTXISD::Suld1DV2I64Zero; 3347 case Intrinsic::nvvm_suld_1d_v4i8_zero: 3348 return NVPTXISD::Suld1DV4I8Zero; 3349 case Intrinsic::nvvm_suld_1d_v4i16_zero: 3350 return NVPTXISD::Suld1DV4I16Zero; 3351 case Intrinsic::nvvm_suld_1d_v4i32_zero: 3352 return NVPTXISD::Suld1DV4I32Zero; 3353 case Intrinsic::nvvm_suld_1d_array_i8_zero: 3354 return NVPTXISD::Suld1DArrayI8Zero; 3355 case Intrinsic::nvvm_suld_1d_array_i16_zero: 3356 return NVPTXISD::Suld1DArrayI16Zero; 3357 case 
Intrinsic::nvvm_suld_1d_array_i32_zero: 3358 return NVPTXISD::Suld1DArrayI32Zero; 3359 case Intrinsic::nvvm_suld_1d_array_i64_zero: 3360 return NVPTXISD::Suld1DArrayI64Zero; 3361 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 3362 return NVPTXISD::Suld1DArrayV2I8Zero; 3363 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 3364 return NVPTXISD::Suld1DArrayV2I16Zero; 3365 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 3366 return NVPTXISD::Suld1DArrayV2I32Zero; 3367 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 3368 return NVPTXISD::Suld1DArrayV2I64Zero; 3369 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 3370 return NVPTXISD::Suld1DArrayV4I8Zero; 3371 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 3372 return NVPTXISD::Suld1DArrayV4I16Zero; 3373 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 3374 return NVPTXISD::Suld1DArrayV4I32Zero; 3375 case Intrinsic::nvvm_suld_2d_i8_zero: 3376 return NVPTXISD::Suld2DI8Zero; 3377 case Intrinsic::nvvm_suld_2d_i16_zero: 3378 return NVPTXISD::Suld2DI16Zero; 3379 case Intrinsic::nvvm_suld_2d_i32_zero: 3380 return NVPTXISD::Suld2DI32Zero; 3381 case Intrinsic::nvvm_suld_2d_i64_zero: 3382 return NVPTXISD::Suld2DI64Zero; 3383 case Intrinsic::nvvm_suld_2d_v2i8_zero: 3384 return NVPTXISD::Suld2DV2I8Zero; 3385 case Intrinsic::nvvm_suld_2d_v2i16_zero: 3386 return NVPTXISD::Suld2DV2I16Zero; 3387 case Intrinsic::nvvm_suld_2d_v2i32_zero: 3388 return NVPTXISD::Suld2DV2I32Zero; 3389 case Intrinsic::nvvm_suld_2d_v2i64_zero: 3390 return NVPTXISD::Suld2DV2I64Zero; 3391 case Intrinsic::nvvm_suld_2d_v4i8_zero: 3392 return NVPTXISD::Suld2DV4I8Zero; 3393 case Intrinsic::nvvm_suld_2d_v4i16_zero: 3394 return NVPTXISD::Suld2DV4I16Zero; 3395 case Intrinsic::nvvm_suld_2d_v4i32_zero: 3396 return NVPTXISD::Suld2DV4I32Zero; 3397 case Intrinsic::nvvm_suld_2d_array_i8_zero: 3398 return NVPTXISD::Suld2DArrayI8Zero; 3399 case Intrinsic::nvvm_suld_2d_array_i16_zero: 3400 return NVPTXISD::Suld2DArrayI16Zero; 3401 case Intrinsic::nvvm_suld_2d_array_i32_zero: 3402 return NVPTXISD::Suld2DArrayI32Zero; 3403 case Intrinsic::nvvm_suld_2d_array_i64_zero: 3404 return NVPTXISD::Suld2DArrayI64Zero; 3405 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 3406 return NVPTXISD::Suld2DArrayV2I8Zero; 3407 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 3408 return NVPTXISD::Suld2DArrayV2I16Zero; 3409 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 3410 return NVPTXISD::Suld2DArrayV2I32Zero; 3411 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 3412 return NVPTXISD::Suld2DArrayV2I64Zero; 3413 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 3414 return NVPTXISD::Suld2DArrayV4I8Zero; 3415 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 3416 return NVPTXISD::Suld2DArrayV4I16Zero; 3417 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 3418 return NVPTXISD::Suld2DArrayV4I32Zero; 3419 case Intrinsic::nvvm_suld_3d_i8_zero: 3420 return NVPTXISD::Suld3DI8Zero; 3421 case Intrinsic::nvvm_suld_3d_i16_zero: 3422 return NVPTXISD::Suld3DI16Zero; 3423 case Intrinsic::nvvm_suld_3d_i32_zero: 3424 return NVPTXISD::Suld3DI32Zero; 3425 case Intrinsic::nvvm_suld_3d_i64_zero: 3426 return NVPTXISD::Suld3DI64Zero; 3427 case Intrinsic::nvvm_suld_3d_v2i8_zero: 3428 return NVPTXISD::Suld3DV2I8Zero; 3429 case Intrinsic::nvvm_suld_3d_v2i16_zero: 3430 return NVPTXISD::Suld3DV2I16Zero; 3431 case Intrinsic::nvvm_suld_3d_v2i32_zero: 3432 return NVPTXISD::Suld3DV2I32Zero; 3433 case Intrinsic::nvvm_suld_3d_v2i64_zero: 3434 return NVPTXISD::Suld3DV2I64Zero; 3435 case Intrinsic::nvvm_suld_3d_v4i8_zero: 3436 return NVPTXISD::Suld3DV4I8Zero; 3437 case 
Intrinsic::nvvm_suld_3d_v4i16_zero: 3438 return NVPTXISD::Suld3DV4I16Zero; 3439 case Intrinsic::nvvm_suld_3d_v4i32_zero: 3440 return NVPTXISD::Suld3DV4I32Zero; 3441 } 3442 } 3443 3444 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 3445 // TgtMemIntrinsic 3446 // because we need the information that is only available in the "Value" type 3447 // of destination 3448 // pointer. In particular, the address space information. 3449 bool NVPTXTargetLowering::getTgtMemIntrinsic( 3450 IntrinsicInfo &Info, const CallInst &I, 3451 MachineFunction &MF, unsigned Intrinsic) const { 3452 switch (Intrinsic) { 3453 default: 3454 return false; 3455 case Intrinsic::nvvm_match_all_sync_i32p: 3456 case Intrinsic::nvvm_match_all_sync_i64p: 3457 Info.opc = ISD::INTRINSIC_W_CHAIN; 3458 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 3459 // in order to model data exchange with other threads, but perform no real 3460 // memory accesses. 3461 Info.memVT = MVT::i1; 3462 3463 // Our result depends on both our and other thread's arguments. 3464 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 3465 return true; 3466 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: 3467 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: 3468 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: 3469 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: 3470 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: 3471 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: 3472 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: 3473 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: 3474 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: 3475 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: 3476 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: 3477 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: 3478 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: 3479 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: 3480 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: 3481 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: 3482 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: 3483 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: 3484 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: 3485 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: 3486 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: 3487 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: 3488 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: 3489 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { 3490 Info.opc = ISD::INTRINSIC_W_CHAIN; 3491 Info.memVT = MVT::v8f16; 3492 Info.ptrVal = I.getArgOperand(0); 3493 Info.offset = 0; 3494 Info.flags = MachineMemOperand::MOLoad; 3495 Info.align = Align(16); 3496 return true; 3497 } 3498 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: 3499 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: 3500 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: 3501 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: 3502 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: 3503 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: 3504 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: 3505 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: 3506 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: 3507 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: 3508 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: 3509 case 
Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: 3510 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: 3511 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: 3512 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: 3513 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 3514 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 3515 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 3516 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 3517 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: 3518 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: 3519 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: 3520 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: 3521 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { 3522 Info.opc = ISD::INTRINSIC_W_CHAIN; 3523 Info.memVT = MVT::v2i32; 3524 Info.ptrVal = I.getArgOperand(0); 3525 Info.offset = 0; 3526 Info.flags = MachineMemOperand::MOLoad; 3527 Info.align = Align(8); 3528 return true; 3529 } 3530 3531 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 3532 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 3533 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 3534 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 3535 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 3536 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 3537 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 3538 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 3539 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: 3540 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: 3541 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: 3542 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: 3543 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: 3544 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: 3545 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: 3546 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: 3547 3548 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 3549 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 3550 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 3551 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 3552 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 3553 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 3554 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 3555 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: 3556 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: 3557 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: 3558 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: 3559 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: 3560 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: 3561 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: 3562 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: 3563 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: 3564 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: 3565 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: { 3566 Info.opc = ISD::INTRINSIC_W_CHAIN; 3567 Info.memVT = MVT::v4i32; 3568 Info.ptrVal = I.getArgOperand(0); 3569 Info.offset = 0; 3570 Info.flags = MachineMemOperand::MOLoad; 3571 Info.align = Align(16); 3572 return true; 3573 } 3574 3575 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 3576 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 3577 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 3578 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 3579 case 
Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 3580 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 3581 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 3582 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 3583 3584 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 3585 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 3586 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 3587 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 3588 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 3589 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 3590 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 3591 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 3592 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 3593 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 3594 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 3595 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 3596 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 3597 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 3598 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 3599 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 3600 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 3601 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 3602 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 3603 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: 3604 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: 3605 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: { 3606 Info.opc = ISD::INTRINSIC_W_CHAIN; 3607 Info.memVT = MVT::i32; 3608 Info.ptrVal = I.getArgOperand(0); 3609 Info.offset = 0; 3610 Info.flags = MachineMemOperand::MOLoad; 3611 Info.align = Align(4); 3612 return true; 3613 } 3614 3615 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 3616 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 3617 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 3618 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 3619 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 3620 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 3621 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 3622 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 3623 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 3624 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 3625 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 3626 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 3627 Info.opc = ISD::INTRINSIC_W_CHAIN; 3628 Info.memVT = MVT::v4f16; 3629 Info.ptrVal = I.getArgOperand(0); 3630 Info.offset = 0; 3631 Info.flags = MachineMemOperand::MOLoad; 3632 Info.align = Align(16); 3633 return true; 3634 } 3635 3636 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 3637 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 3638 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 3639 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 3640 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 3641 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 3642 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 3643 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 3644 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 3645 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 3646 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 3647 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: 3648 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: 3649 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: 3650 case 
Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: 3651 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { 3652 Info.opc = ISD::INTRINSIC_W_CHAIN; 3653 Info.memVT = MVT::v8f32; 3654 Info.ptrVal = I.getArgOperand(0); 3655 Info.offset = 0; 3656 Info.flags = MachineMemOperand::MOLoad; 3657 Info.align = Align(16); 3658 return true; 3659 } 3660 3661 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: 3662 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: 3663 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: 3664 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: 3665 3666 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: 3667 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: 3668 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: 3669 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: 3670 3671 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 3672 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 3673 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 3674 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 3675 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 3676 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 3677 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 3678 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 3679 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 3680 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 3681 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 3682 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 3683 Info.opc = ISD::INTRINSIC_W_CHAIN; 3684 Info.memVT = MVT::v8i32; 3685 Info.ptrVal = I.getArgOperand(0); 3686 Info.offset = 0; 3687 Info.flags = MachineMemOperand::MOLoad; 3688 Info.align = Align(16); 3689 return true; 3690 } 3691 3692 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 3693 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 3694 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 3695 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 3696 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 3697 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 3698 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 3699 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: 3700 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: 3701 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: { 3702 Info.opc = ISD::INTRINSIC_W_CHAIN; 3703 Info.memVT = MVT::v2i32; 3704 Info.ptrVal = I.getArgOperand(0); 3705 Info.offset = 0; 3706 Info.flags = MachineMemOperand::MOLoad; 3707 Info.align = Align(8); 3708 return true; 3709 } 3710 3711 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: 3712 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: 3713 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: 3714 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: 3715 3716 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: 3717 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: 3718 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: 3719 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { 3720 Info.opc = ISD::INTRINSIC_W_CHAIN; 3721 Info.memVT = MVT::f64; 3722 Info.ptrVal = I.getArgOperand(0); 3723 Info.offset = 0; 3724 Info.flags = MachineMemOperand::MOLoad; 3725 Info.align = Align(8); 3726 return true; 3727 } 3728 3729 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: 3730 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: 3731 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: 3732 case 
Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { 3733 Info.opc = ISD::INTRINSIC_W_CHAIN; 3734 Info.memVT = MVT::v2f64; 3735 Info.ptrVal = I.getArgOperand(0); 3736 Info.offset = 0; 3737 Info.flags = MachineMemOperand::MOLoad; 3738 Info.align = Align(16); 3739 return true; 3740 } 3741 3742 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 3743 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 3744 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 3745 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 3746 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 3747 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 3748 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 3749 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 3750 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 3751 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 3752 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 3753 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 3754 Info.opc = ISD::INTRINSIC_VOID; 3755 Info.memVT = MVT::v4f16; 3756 Info.ptrVal = I.getArgOperand(0); 3757 Info.offset = 0; 3758 Info.flags = MachineMemOperand::MOStore; 3759 Info.align = Align(16); 3760 return true; 3761 } 3762 3763 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 3764 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 3765 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 3766 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 3767 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 3768 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 3769 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 3770 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 3771 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 3772 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 3773 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 3774 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: 3775 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: 3776 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: 3777 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: 3778 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { 3779 Info.opc = ISD::INTRINSIC_VOID; 3780 Info.memVT = MVT::v8f32; 3781 Info.ptrVal = I.getArgOperand(0); 3782 Info.offset = 0; 3783 Info.flags = MachineMemOperand::MOStore; 3784 Info.align = Align(16); 3785 return true; 3786 } 3787 3788 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 3789 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 3790 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 3791 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 3792 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 3793 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 3794 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 3795 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 3796 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 3797 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 3798 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 3799 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 3800 Info.opc = ISD::INTRINSIC_VOID; 3801 Info.memVT = MVT::v8i32; 3802 Info.ptrVal = I.getArgOperand(0); 3803 Info.offset = 0; 3804 Info.flags = MachineMemOperand::MOStore; 3805 Info.align = Align(16); 3806 return true; 3807 } 3808 3809 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 3810 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 3811 
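  // Descriptive note: the m8n8k128 and m8n8k32 integer MMA stores in this
  // group write a two-element 32-bit accumulator fragment, so the memory
  // operand is modeled as v2i32 with 8-byte alignment in the Info setup
  // that follows.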
case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 3812 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 3813 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 3814 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 3815 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 3816 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 3817 Info.opc = ISD::INTRINSIC_VOID; 3818 Info.memVT = MVT::v2i32; 3819 Info.ptrVal = I.getArgOperand(0); 3820 Info.offset = 0; 3821 Info.flags = MachineMemOperand::MOStore; 3822 Info.align = Align(8); 3823 return true; 3824 } 3825 3826 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: 3827 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: 3828 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: 3829 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { 3830 Info.opc = ISD::INTRINSIC_VOID; 3831 Info.memVT = MVT::v2f64; 3832 Info.ptrVal = I.getArgOperand(0); 3833 Info.offset = 0; 3834 Info.flags = MachineMemOperand::MOStore; 3835 Info.align = Align(16); 3836 return true; 3837 } 3838 3839 case Intrinsic::nvvm_atomic_load_inc_32: 3840 case Intrinsic::nvvm_atomic_load_dec_32: 3841 3842 case Intrinsic::nvvm_atomic_add_gen_f_cta: 3843 case Intrinsic::nvvm_atomic_add_gen_f_sys: 3844 case Intrinsic::nvvm_atomic_add_gen_i_cta: 3845 case Intrinsic::nvvm_atomic_add_gen_i_sys: 3846 case Intrinsic::nvvm_atomic_and_gen_i_cta: 3847 case Intrinsic::nvvm_atomic_and_gen_i_sys: 3848 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 3849 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 3850 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 3851 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 3852 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 3853 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 3854 case Intrinsic::nvvm_atomic_max_gen_i_cta: 3855 case Intrinsic::nvvm_atomic_max_gen_i_sys: 3856 case Intrinsic::nvvm_atomic_min_gen_i_cta: 3857 case Intrinsic::nvvm_atomic_min_gen_i_sys: 3858 case Intrinsic::nvvm_atomic_or_gen_i_cta: 3859 case Intrinsic::nvvm_atomic_or_gen_i_sys: 3860 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 3861 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 3862 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 3863 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 3864 auto &DL = I.getModule()->getDataLayout(); 3865 Info.opc = ISD::INTRINSIC_W_CHAIN; 3866 Info.memVT = getValueType(DL, I.getType()); 3867 Info.ptrVal = I.getArgOperand(0); 3868 Info.offset = 0; 3869 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 3870 Info.align.reset(); 3871 return true; 3872 } 3873 3874 case Intrinsic::nvvm_ldu_global_i: 3875 case Intrinsic::nvvm_ldu_global_f: 3876 case Intrinsic::nvvm_ldu_global_p: { 3877 auto &DL = I.getModule()->getDataLayout(); 3878 Info.opc = ISD::INTRINSIC_W_CHAIN; 3879 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 3880 Info.memVT = getValueType(DL, I.getType()); 3881 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 3882 Info.memVT = getPointerTy(DL); 3883 else 3884 Info.memVT = getValueType(DL, I.getType()); 3885 Info.ptrVal = I.getArgOperand(0); 3886 Info.offset = 0; 3887 Info.flags = MachineMemOperand::MOLoad; 3888 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 3889 3890 return true; 3891 } 3892 case Intrinsic::nvvm_ldg_global_i: 3893 case Intrinsic::nvvm_ldg_global_f: 3894 case Intrinsic::nvvm_ldg_global_p: { 3895 auto &DL = I.getModule()->getDataLayout(); 3896 3897 Info.opc = ISD::INTRINSIC_W_CHAIN; 3898 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 3899 Info.memVT = getValueType(DL, I.getType()); 3900 else if(Intrinsic 
== Intrinsic::nvvm_ldg_global_p) 3901 Info.memVT = getPointerTy(DL); 3902 else 3903 Info.memVT = getValueType(DL, I.getType()); 3904 Info.ptrVal = I.getArgOperand(0); 3905 Info.offset = 0; 3906 Info.flags = MachineMemOperand::MOLoad; 3907 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 3908 3909 return true; 3910 } 3911 3912 case Intrinsic::nvvm_tex_1d_v4f32_s32: 3913 case Intrinsic::nvvm_tex_1d_v4f32_f32: 3914 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 3915 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3916 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3917 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3918 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3919 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3920 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3921 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3922 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3923 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3924 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3925 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3926 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3927 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3928 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3929 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3930 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3931 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3932 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3933 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3934 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3935 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3936 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3937 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3938 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3939 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3940 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3941 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3942 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3943 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3944 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3945 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3946 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3947 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3948 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3949 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3950 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3951 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3952 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3953 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3954 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3955 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3956 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3957 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3958 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3959 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3960 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3961 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3962 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3963 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3964 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3965 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3966 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3967 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3968 Info.opc = getOpcForTextureInstr(Intrinsic); 3969 Info.memVT = MVT::v4f32; 3970 Info.ptrVal = nullptr; 3971 Info.offset = 0; 3972 Info.flags = MachineMemOperand::MOLoad; 3973 Info.align = Align(16); 3974 return true; 
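  // Descriptive note: unlike the ldg/ldu intrinsics handled above, texture
  // and surface reads reference their resource through an opaque handle
  // rather than a generic pointer, so there is no IR pointer value to attach
  // to the memory operand (Info.ptrVal stays null); only the result width
  // and alignment are modeled. The group below covers the variants returning
  // four 32-bit integer components (v4s32/v4u32).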
3975 3976 case Intrinsic::nvvm_tex_1d_v4s32_s32: 3977 case Intrinsic::nvvm_tex_1d_v4s32_f32: 3978 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 3979 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 3980 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 3981 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 3982 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 3983 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 3984 case Intrinsic::nvvm_tex_2d_v4s32_s32: 3985 case Intrinsic::nvvm_tex_2d_v4s32_f32: 3986 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 3987 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 3988 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3989 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3990 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3991 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3992 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3993 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3994 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3995 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3996 case Intrinsic::nvvm_tex_cube_v4s32_f32: 3997 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3998 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3999 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 4000 case Intrinsic::nvvm_tex_cube_v4u32_f32: 4001 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 4002 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 4003 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 4004 case Intrinsic::nvvm_tex_1d_v4u32_s32: 4005 case Intrinsic::nvvm_tex_1d_v4u32_f32: 4006 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 4007 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 4008 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 4009 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 4010 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 4011 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 4012 case Intrinsic::nvvm_tex_2d_v4u32_s32: 4013 case Intrinsic::nvvm_tex_2d_v4u32_f32: 4014 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 4015 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 4016 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 4017 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 4018 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 4019 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 4020 case Intrinsic::nvvm_tex_3d_v4u32_s32: 4021 case Intrinsic::nvvm_tex_3d_v4u32_f32: 4022 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 4023 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 4024 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 4025 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 4026 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 4027 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 4028 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 4029 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 4030 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 4031 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 4032 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 4033 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 4034 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 4035 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 4036 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 4037 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 4038 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 4039 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 4040 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 4041 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 4042 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 4043 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 4044 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 4045 case 
Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 4046 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 4047 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 4048 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 4049 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 4050 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 4051 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 4052 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 4053 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 4054 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 4055 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 4056 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 4057 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 4058 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 4059 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 4060 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 4061 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 4062 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 4063 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 4064 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 4065 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 4066 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 4067 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 4068 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 4069 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 4070 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 4071 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 4072 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 4073 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 4074 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 4075 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 4076 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 4077 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 4078 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 4079 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 4080 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 4081 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 4082 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 4083 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 4084 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 4085 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 4086 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 4087 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 4088 Info.opc = getOpcForTextureInstr(Intrinsic); 4089 Info.memVT = MVT::v4i32; 4090 Info.ptrVal = nullptr; 4091 Info.offset = 0; 4092 Info.flags = MachineMemOperand::MOLoad; 4093 Info.align = Align(16); 4094 return true; 4095 4096 case Intrinsic::nvvm_suld_1d_i8_clamp: 4097 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 4098 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 4099 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 4100 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 4101 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 4102 case Intrinsic::nvvm_suld_2d_i8_clamp: 4103 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 4104 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4105 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4106 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4107 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4108 case Intrinsic::nvvm_suld_3d_i8_clamp: 4109 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4110 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4111 case Intrinsic::nvvm_suld_1d_i8_trap: 4112 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4113 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4114 case 
Intrinsic::nvvm_suld_1d_array_i8_trap: 4115 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4116 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4117 case Intrinsic::nvvm_suld_2d_i8_trap: 4118 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4119 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4120 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4121 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4122 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4123 case Intrinsic::nvvm_suld_3d_i8_trap: 4124 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4125 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4126 case Intrinsic::nvvm_suld_1d_i8_zero: 4127 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4128 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4129 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4130 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4131 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4132 case Intrinsic::nvvm_suld_2d_i8_zero: 4133 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4134 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4135 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4136 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4137 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4138 case Intrinsic::nvvm_suld_3d_i8_zero: 4139 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4140 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4141 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4142 Info.memVT = MVT::i8; 4143 Info.ptrVal = nullptr; 4144 Info.offset = 0; 4145 Info.flags = MachineMemOperand::MOLoad; 4146 Info.align = Align(16); 4147 return true; 4148 4149 case Intrinsic::nvvm_suld_1d_i16_clamp: 4150 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4151 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4152 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4153 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4154 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4155 case Intrinsic::nvvm_suld_2d_i16_clamp: 4156 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4157 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4158 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4159 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4160 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4161 case Intrinsic::nvvm_suld_3d_i16_clamp: 4162 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4163 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4164 case Intrinsic::nvvm_suld_1d_i16_trap: 4165 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4166 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4167 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4168 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4169 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4170 case Intrinsic::nvvm_suld_2d_i16_trap: 4171 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4172 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4173 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4174 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4175 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4176 case Intrinsic::nvvm_suld_3d_i16_trap: 4177 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4178 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4179 case Intrinsic::nvvm_suld_1d_i16_zero: 4180 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4181 case Intrinsic::nvvm_suld_1d_v4i16_zero: 4182 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4183 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4184 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4185 case Intrinsic::nvvm_suld_2d_i16_zero: 4186 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4187 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4188 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4189 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4190 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4191 case Intrinsic::nvvm_suld_3d_i16_zero: 
4192 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4193 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4194 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4195 Info.memVT = MVT::i16; 4196 Info.ptrVal = nullptr; 4197 Info.offset = 0; 4198 Info.flags = MachineMemOperand::MOLoad; 4199 Info.align = Align(16); 4200 return true; 4201 4202 case Intrinsic::nvvm_suld_1d_i32_clamp: 4203 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4204 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4205 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 4206 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 4207 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 4208 case Intrinsic::nvvm_suld_2d_i32_clamp: 4209 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 4210 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 4211 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 4212 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 4213 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 4214 case Intrinsic::nvvm_suld_3d_i32_clamp: 4215 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 4216 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 4217 case Intrinsic::nvvm_suld_1d_i32_trap: 4218 case Intrinsic::nvvm_suld_1d_v2i32_trap: 4219 case Intrinsic::nvvm_suld_1d_v4i32_trap: 4220 case Intrinsic::nvvm_suld_1d_array_i32_trap: 4221 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 4222 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 4223 case Intrinsic::nvvm_suld_2d_i32_trap: 4224 case Intrinsic::nvvm_suld_2d_v2i32_trap: 4225 case Intrinsic::nvvm_suld_2d_v4i32_trap: 4226 case Intrinsic::nvvm_suld_2d_array_i32_trap: 4227 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 4228 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 4229 case Intrinsic::nvvm_suld_3d_i32_trap: 4230 case Intrinsic::nvvm_suld_3d_v2i32_trap: 4231 case Intrinsic::nvvm_suld_3d_v4i32_trap: 4232 case Intrinsic::nvvm_suld_1d_i32_zero: 4233 case Intrinsic::nvvm_suld_1d_v2i32_zero: 4234 case Intrinsic::nvvm_suld_1d_v4i32_zero: 4235 case Intrinsic::nvvm_suld_1d_array_i32_zero: 4236 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 4237 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 4238 case Intrinsic::nvvm_suld_2d_i32_zero: 4239 case Intrinsic::nvvm_suld_2d_v2i32_zero: 4240 case Intrinsic::nvvm_suld_2d_v4i32_zero: 4241 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4242 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4243 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4244 case Intrinsic::nvvm_suld_3d_i32_zero: 4245 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4246 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4247 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4248 Info.memVT = MVT::i32; 4249 Info.ptrVal = nullptr; 4250 Info.offset = 0; 4251 Info.flags = MachineMemOperand::MOLoad; 4252 Info.align = Align(16); 4253 return true; 4254 4255 case Intrinsic::nvvm_suld_1d_i64_clamp: 4256 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 4257 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 4258 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 4259 case Intrinsic::nvvm_suld_2d_i64_clamp: 4260 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 4261 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 4262 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 4263 case Intrinsic::nvvm_suld_3d_i64_clamp: 4264 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 4265 case Intrinsic::nvvm_suld_1d_i64_trap: 4266 case Intrinsic::nvvm_suld_1d_v2i64_trap: 4267 case Intrinsic::nvvm_suld_1d_array_i64_trap: 4268 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 4269 case Intrinsic::nvvm_suld_2d_i64_trap: 4270 case Intrinsic::nvvm_suld_2d_v2i64_trap: 4271 case Intrinsic::nvvm_suld_2d_array_i64_trap: 4272 case 
Intrinsic::nvvm_suld_2d_array_v2i64_trap: 4273 case Intrinsic::nvvm_suld_3d_i64_trap: 4274 case Intrinsic::nvvm_suld_3d_v2i64_trap: 4275 case Intrinsic::nvvm_suld_1d_i64_zero: 4276 case Intrinsic::nvvm_suld_1d_v2i64_zero: 4277 case Intrinsic::nvvm_suld_1d_array_i64_zero: 4278 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 4279 case Intrinsic::nvvm_suld_2d_i64_zero: 4280 case Intrinsic::nvvm_suld_2d_v2i64_zero: 4281 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4282 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4283 case Intrinsic::nvvm_suld_3d_i64_zero: 4284 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4285 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4286 Info.memVT = MVT::i64; 4287 Info.ptrVal = nullptr; 4288 Info.offset = 0; 4289 Info.flags = MachineMemOperand::MOLoad; 4290 Info.align = Align(16); 4291 return true; 4292 } 4293 return false; 4294 } 4295 4296 /// isLegalAddressingMode - Return true if the addressing mode represented 4297 /// by AM is legal for this target, for a load/store of the specified type. 4298 /// Used to guide target specific optimizations, like loop strength reduction 4299 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 4300 /// (CodeGenPrepare.cpp) 4301 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, 4302 const AddrMode &AM, Type *Ty, 4303 unsigned AS, Instruction *I) const { 4304 // AddrMode - This represents an addressing mode of: 4305 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 4306 // 4307 // The legal address modes are 4308 // - [avar] 4309 // - [areg] 4310 // - [areg+immoff] 4311 // - [immAddr] 4312 4313 if (AM.BaseGV) { 4314 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; 4315 } 4316 4317 switch (AM.Scale) { 4318 case 0: // "r", "r+i" or "i" is allowed 4319 break; 4320 case 1: 4321 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 4322 return false; 4323 // Otherwise we have r+i. 4324 break; 4325 default: 4326 // No scale > 1 is allowed 4327 return false; 4328 } 4329 return true; 4330 } 4331 4332 //===----------------------------------------------------------------------===// 4333 // NVPTX Inline Assembly Support 4334 //===----------------------------------------------------------------------===// 4335 4336 /// getConstraintType - Given a constraint letter, return the type of 4337 /// constraint it is for this target. 
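/// For example (an illustrative sketch with hypothetical variables res, a, b),
/// CUDA inline asm such as
///   asm("add.s32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b));
/// uses the 'r' constraint, which is classified here as C_RegisterClass and
/// mapped to a 32-bit integer register class below; 'l', 'f' and 'd' similarly
/// request 64-bit integer, 32-bit float and 64-bit float registers.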
4338 NVPTXTargetLowering::ConstraintType 4339 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { 4340 if (Constraint.size() == 1) { 4341 switch (Constraint[0]) { 4342 default: 4343 break; 4344 case 'b': 4345 case 'r': 4346 case 'h': 4347 case 'c': 4348 case 'l': 4349 case 'f': 4350 case 'd': 4351 case '0': 4352 case 'N': 4353 return C_RegisterClass; 4354 } 4355 } 4356 return TargetLowering::getConstraintType(Constraint); 4357 } 4358 4359 std::pair<unsigned, const TargetRegisterClass *> 4360 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 4361 StringRef Constraint, 4362 MVT VT) const { 4363 if (Constraint.size() == 1) { 4364 switch (Constraint[0]) { 4365 case 'b': 4366 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 4367 case 'c': 4368 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 4369 case 'h': 4370 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 4371 case 'r': 4372 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 4373 case 'l': 4374 case 'N': 4375 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 4376 case 'f': 4377 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 4378 case 'd': 4379 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 4380 } 4381 } 4382 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4383 } 4384 4385 //===----------------------------------------------------------------------===// 4386 // NVPTX DAG Combining 4387 //===----------------------------------------------------------------------===// 4388 4389 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, 4390 CodeGenOpt::Level OptLevel) const { 4391 // Always honor command-line argument 4392 if (FMAContractLevelOpt.getNumOccurrences() > 0) 4393 return FMAContractLevelOpt > 0; 4394 4395 // Do not contract if we're not optimizing the code. 4396 if (OptLevel == 0) 4397 return false; 4398 4399 // Honor TargetOptions flags that explicitly say fusion is okay. 4400 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) 4401 return true; 4402 4403 return allowUnsafeFPMath(MF); 4404 } 4405 4406 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { 4407 // Honor TargetOptions flags that explicitly say unsafe math is okay. 4408 if (MF.getTarget().Options.UnsafeFPMath) 4409 return true; 4410 4411 // Allow unsafe math if unsafe-fp-math attribute explicitly says so. 4412 const Function &F = MF.getFunction(); 4413 return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); 4414 } 4415 4416 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 4417 /// operands N0 and N1. This is a helper for PerformADDCombine that is 4418 /// called with the default operands, and if that fails, with commuted 4419 /// operands. 4420 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 4421 TargetLowering::DAGCombinerInfo &DCI, 4422 const NVPTXSubtarget &Subtarget, 4423 CodeGenOpt::Level OptLevel) { 4424 SelectionDAG &DAG = DCI.DAG; 4425 // Skip non-integer, non-scalar case 4426 EVT VT=N0.getValueType(); 4427 if (VT.isVector()) 4428 return SDValue(); 4429 4430 // fold (add (mul a, b), c) -> (mad a, b, c) 4431 // 4432 if (N0.getOpcode() == ISD::MUL) { 4433 assert (VT.isInteger()); 4434 // For integer: 4435 // Since integer multiply-add costs the same as integer multiply 4436 // but is more costly than integer add, do the fusion only when 4437 // the mul is only used in the add. 
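// Illustrative sketch (with hypothetical i32 values %a, %b, %c): a DAG of
//   t1 = mul %a, %b      (single use)
//   t2 = add t1, %c
// is rewritten to t2 = NVPTXISD::IMAD %a, %b, %c, which is expected to be
// selected as a PTX mad.lo.s32 instruction.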
4438 if (OptLevel==CodeGenOpt::None || VT != MVT::i32 ||
4439 !N0.getNode()->hasOneUse())
4440 return SDValue();
4441
4442 // Do the folding
4443 return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4444 N0.getOperand(0), N0.getOperand(1), N1);
4445 }
4446 else if (N0.getOpcode() == ISD::FMUL) {
4447 if (VT == MVT::f32 || VT == MVT::f64) {
4448 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4449 &DAG.getTargetLoweringInfo());
4450 if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4451 return SDValue();
4452
4453 // For floating point:
4454 // Do the fusion only when the mul has fewer than 5 uses and all
4455 // of them are adds.
4456 // The heuristic is that if a use is not an add, then that use
4457 // cannot be fused into an fma, so the mul is still needed anyway.
4458 // If there are more than 4 uses, even if they are all adds, fusing
4459 // them will increase register pressure.
4460 //
4461 int numUses = 0;
4462 int nonAddCount = 0;
4463 for (const SDNode *User : N0.getNode()->uses()) {
4464 numUses++;
4465 if (User->getOpcode() != ISD::FADD)
4466 ++nonAddCount;
4467 }
4468 if (numUses >= 5)
4469 return SDValue();
4470 if (nonAddCount) {
4471 int orderNo = N->getIROrder();
4472 int orderNo2 = N0.getNode()->getIROrder();
4473 // Simple heuristic for estimating potential register pressure:
4474 // the difference between the IR orders measures the distance
4475 // between the def and the use, and a longer distance is more
4476 // likely to cause register pressure.
4477 if (orderNo - orderNo2 < 500)
4478 return SDValue();
4479
4480 // Now, check if at least one of the FMUL's operands is live beyond the node N,
4481 // which guarantees that the FMA will not increase register pressure at node N.
4482 bool opIsLive = false;
4483 const SDNode *left = N0.getOperand(0).getNode();
4484 const SDNode *right = N0.getOperand(1).getNode();
4485
4486 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4487 opIsLive = true;
4488
4489 if (!opIsLive)
4490 for (const SDNode *User : left->uses()) {
4491 int orderNo3 = User->getIROrder();
4492 if (orderNo3 > orderNo) {
4493 opIsLive = true;
4494 break;
4495 }
4496 }
4497
4498 if (!opIsLive)
4499 for (const SDNode *User : right->uses()) {
4500 int orderNo3 = User->getIROrder();
4501 if (orderNo3 > orderNo) {
4502 opIsLive = true;
4503 break;
4504 }
4505 }
4506
4507 if (!opIsLive)
4508 return SDValue();
4509 }
4510
4511 return DAG.getNode(ISD::FMA, SDLoc(N), VT,
4512 N0.getOperand(0), N0.getOperand(1), N1);
4513 }
4514 }
4515
4516 return SDValue();
4517 }
4518
4519 /// PerformADDCombine - Target-specific DAG combine xforms for ISD::ADD.
4520 ///
4521 static SDValue PerformADDCombine(SDNode *N,
4522 TargetLowering::DAGCombinerInfo &DCI,
4523 const NVPTXSubtarget &Subtarget,
4524 CodeGenOpt::Level OptLevel) {
4525 SDValue N0 = N->getOperand(0);
4526 SDValue N1 = N->getOperand(1);
4527
4528 // First try with the default operand order.
4529 if (SDValue Result =
4530 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
4531 return Result;
4532
4533 // If that didn't work, try again with the operands commuted.
4534 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
4535 }
4536
4537 static SDValue PerformANDCombine(SDNode *N,
4538 TargetLowering::DAGCombinerInfo &DCI) {
4539 // The type legalizer turns a vector load of i8 values into a zextload to i16
4540 // registers, optionally ANY_EXTENDs it (if target type is integer),
4541 // and ANDs off the high 8 bits.
Since we turn this load into a 4542 // target-specific DAG node, the DAG combiner fails to eliminate these AND 4543 // nodes. Do that here. 4544 SDValue Val = N->getOperand(0); 4545 SDValue Mask = N->getOperand(1); 4546 4547 if (isa<ConstantSDNode>(Val)) { 4548 std::swap(Val, Mask); 4549 } 4550 4551 SDValue AExt; 4552 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 4553 if (Val.getOpcode() == ISD::ANY_EXTEND) { 4554 AExt = Val; 4555 Val = Val->getOperand(0); 4556 } 4557 4558 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 4559 Val = Val->getOperand(0); 4560 } 4561 4562 if (Val->getOpcode() == NVPTXISD::LoadV2 || 4563 Val->getOpcode() == NVPTXISD::LoadV4) { 4564 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 4565 if (!MaskCnst) { 4566 // Not an AND with a constant 4567 return SDValue(); 4568 } 4569 4570 uint64_t MaskVal = MaskCnst->getZExtValue(); 4571 if (MaskVal != 0xff) { 4572 // Not an AND that chops off top 8 bits 4573 return SDValue(); 4574 } 4575 4576 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 4577 if (!Mem) { 4578 // Not a MemSDNode?!? 4579 return SDValue(); 4580 } 4581 4582 EVT MemVT = Mem->getMemoryVT(); 4583 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 4584 // We only handle the i8 case 4585 return SDValue(); 4586 } 4587 4588 unsigned ExtType = 4589 cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> 4590 getZExtValue(); 4591 if (ExtType == ISD::SEXTLOAD) { 4592 // If for some reason the load is a sextload, the and is needed to zero 4593 // out the high 8 bits 4594 return SDValue(); 4595 } 4596 4597 bool AddTo = false; 4598 if (AExt.getNode() != nullptr) { 4599 // Re-insert the ext as a zext. 4600 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 4601 AExt.getValueType(), Val); 4602 AddTo = true; 4603 } 4604 4605 // If we get here, the AND is unnecessary. Just replace it with the load 4606 DCI.CombineTo(N, Val, AddTo); 4607 } 4608 4609 return SDValue(); 4610 } 4611 4612 static SDValue PerformREMCombine(SDNode *N, 4613 TargetLowering::DAGCombinerInfo &DCI, 4614 CodeGenOpt::Level OptLevel) { 4615 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); 4616 4617 // Don't do anything at less than -O2. 4618 if (OptLevel < CodeGenOpt::Default) 4619 return SDValue(); 4620 4621 SelectionDAG &DAG = DCI.DAG; 4622 SDLoc DL(N); 4623 EVT VT = N->getValueType(0); 4624 bool IsSigned = N->getOpcode() == ISD::SREM; 4625 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; 4626 4627 const SDValue &Num = N->getOperand(0); 4628 const SDValue &Den = N->getOperand(1); 4629 4630 for (const SDNode *U : Num->uses()) { 4631 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && 4632 U->getOperand(1) == Den) { 4633 // Num % Den -> Num - (Num / Den) * Den 4634 return DAG.getNode(ISD::SUB, DL, VT, Num, 4635 DAG.getNode(ISD::MUL, DL, VT, 4636 DAG.getNode(DivOpc, DL, VT, Num, Den), 4637 Den)); 4638 } 4639 } 4640 return SDValue(); 4641 } 4642 4643 enum OperandSignedness { 4644 Signed = 0, 4645 Unsigned, 4646 Unknown 4647 }; 4648 4649 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 4650 /// that can be demoted to \p OptSize bits without loss of information. The 4651 /// signedness of the operand, if determinable, is placed in \p S. 
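/// For example (an illustrative sketch with a hypothetical value %x): with
/// \p OptSize = 16, an i32 operand of the form (sign_extend i16 %x) is
/// demotable and reported as Signed, and (zero_extend i16 %x) is demotable
/// and reported as Unsigned; any other form leaves \p S as Unknown and is
/// rejected.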
4652 static bool IsMulWideOperandDemotable(SDValue Op, 4653 unsigned OptSize, 4654 OperandSignedness &S) { 4655 S = Unknown; 4656 4657 if (Op.getOpcode() == ISD::SIGN_EXTEND || 4658 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 4659 EVT OrigVT = Op.getOperand(0).getValueType(); 4660 if (OrigVT.getFixedSizeInBits() <= OptSize) { 4661 S = Signed; 4662 return true; 4663 } 4664 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 4665 EVT OrigVT = Op.getOperand(0).getValueType(); 4666 if (OrigVT.getFixedSizeInBits() <= OptSize) { 4667 S = Unsigned; 4668 return true; 4669 } 4670 } 4671 4672 return false; 4673 } 4674 4675 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 4676 /// be demoted to \p OptSize bits without loss of information. If the operands 4677 /// contain a constant, it should appear as the RHS operand. The signedness of 4678 /// the operands is placed in \p IsSigned. 4679 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 4680 unsigned OptSize, 4681 bool &IsSigned) { 4682 OperandSignedness LHSSign; 4683 4684 // The LHS operand must be a demotable op 4685 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 4686 return false; 4687 4688 // We should have been able to determine the signedness from the LHS 4689 if (LHSSign == Unknown) 4690 return false; 4691 4692 IsSigned = (LHSSign == Signed); 4693 4694 // The RHS can be a demotable op or a constant 4695 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 4696 const APInt &Val = CI->getAPIntValue(); 4697 if (LHSSign == Unsigned) { 4698 return Val.isIntN(OptSize); 4699 } else { 4700 return Val.isSignedIntN(OptSize); 4701 } 4702 } else { 4703 OperandSignedness RHSSign; 4704 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 4705 return false; 4706 4707 return LHSSign == RHSSign; 4708 } 4709 } 4710 4711 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 4712 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 4713 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 4714 /// amount. 4715 static SDValue TryMULWIDECombine(SDNode *N, 4716 TargetLowering::DAGCombinerInfo &DCI) { 4717 EVT MulType = N->getValueType(0); 4718 if (MulType != MVT::i32 && MulType != MVT::i64) { 4719 return SDValue(); 4720 } 4721 4722 SDLoc DL(N); 4723 unsigned OptSize = MulType.getSizeInBits() >> 1; 4724 SDValue LHS = N->getOperand(0); 4725 SDValue RHS = N->getOperand(1); 4726 4727 // Canonicalize the multiply so the constant (if any) is on the right 4728 if (N->getOpcode() == ISD::MUL) { 4729 if (isa<ConstantSDNode>(LHS)) { 4730 std::swap(LHS, RHS); 4731 } 4732 } 4733 4734 // If we have a SHL, determine the actual multiply amount 4735 if (N->getOpcode() == ISD::SHL) { 4736 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 4737 if (!ShlRHS) { 4738 return SDValue(); 4739 } 4740 4741 APInt ShiftAmt = ShlRHS->getAPIntValue(); 4742 unsigned BitWidth = MulType.getSizeInBits(); 4743 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 4744 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 4745 RHS = DCI.DAG.getConstant(MulVal, DL, MulType); 4746 } else { 4747 return SDValue(); 4748 } 4749 } 4750 4751 bool Signed; 4752 // Verify that our operands are demotable 4753 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 4754 return SDValue(); 4755 } 4756 4757 EVT DemotedVT; 4758 if (MulType == MVT::i32) { 4759 DemotedVT = MVT::i16; 4760 } else { 4761 DemotedVT = MVT::i32; 4762 } 4763 4764 // Truncate the operands to the correct size. 
Note that these are just for
4765 // type consistency and will (likely) be eliminated in later phases.
4766 SDValue TruncLHS =
4767 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4768 SDValue TruncRHS =
4769 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4770
4771 unsigned Opc;
4772 if (Signed) {
4773 Opc = NVPTXISD::MUL_WIDE_SIGNED;
4774 } else {
4775 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
4776 }
4777
4778 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4779 }
4780
4781 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
4782 static SDValue PerformMULCombine(SDNode *N,
4783 TargetLowering::DAGCombinerInfo &DCI,
4784 CodeGenOpt::Level OptLevel) {
4785 if (OptLevel > 0) {
4786 // Try mul.wide combining at OptLevel > 0
4787 if (SDValue Ret = TryMULWIDECombine(N, DCI))
4788 return Ret;
4789 }
4790
4791 return SDValue();
4792 }
4793
4794 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
4795 static SDValue PerformSHLCombine(SDNode *N,
4796 TargetLowering::DAGCombinerInfo &DCI,
4797 CodeGenOpt::Level OptLevel) {
4798 if (OptLevel > 0) {
4799 // Try mul.wide combining at OptLevel > 0
4800 if (SDValue Ret = TryMULWIDECombine(N, DCI))
4801 return Ret;
4802 }
4803
4804 return SDValue();
4805 }
4806
4807 static SDValue PerformSETCCCombine(SDNode *N,
4808 TargetLowering::DAGCombinerInfo &DCI) {
4809 EVT CCType = N->getValueType(0);
4810 SDValue A = N->getOperand(0);
4811 SDValue B = N->getOperand(1);
4812
4813 if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
4814 return SDValue();
4815
4816 SDLoc DL(N);
4817 // setp.f16x2 returns two scalar predicates, which we need to
4818 // convert back to v2i1. The returned result will be scalarized by
4819 // the legalizer, but the comparison will remain a single vector
4820 // instruction.
4821 SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
4822 DCI.DAG.getVTList(MVT::i1, MVT::i1),
4823 {A, B, N->getOperand(2)});
4824 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
4825 CCNode.getValue(1));
4826 }
4827
4828 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
4829 DAGCombinerInfo &DCI) const {
4830 CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
4831 switch (N->getOpcode()) {
4832 default: break;
4833 case ISD::ADD:
4834 case ISD::FADD:
4835 return PerformADDCombine(N, DCI, STI, OptLevel);
4836 case ISD::MUL:
4837 return PerformMULCombine(N, DCI, OptLevel);
4838 case ISD::SHL:
4839 return PerformSHLCombine(N, DCI, OptLevel);
4840 case ISD::AND:
4841 return PerformANDCombine(N, DCI);
4842 case ISD::UREM:
4843 case ISD::SREM:
4844 return PerformREMCombine(N, DCI, OptLevel);
4845 case ISD::SETCC:
4846 return PerformSETCCCombine(N, DCI);
4847 }
4848 return SDValue();
4849 }
4850
4851 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
4852 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
4853 SmallVectorImpl<SDValue> &Results) {
4854 EVT ResVT = N->getValueType(0);
4855 SDLoc DL(N);
4856
4857 assert(ResVT.isVector() && "Vector load must have vector type");
4858
4859 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
4860 // legal. We can (and should) split that into 2 loads of <2 x double> here
4861 // but I'm leaving that as a TODO for now.
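//
// Illustrative sketch of the rewrite performed below (assuming a natively
// sized, sufficiently aligned load; value names are hypothetical):
//   t: v4f32,ch = load %ptr
// becomes
//   a, b, c, d, ch = NVPTXISD::LoadV4 %ptr
//   t = BUILD_VECTOR a, b, c, d
// which is expected to select to a single ld.v4.f32 instruction.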
4862 assert(ResVT.isSimple() && "Can only handle simple types"); 4863 switch (ResVT.getSimpleVT().SimpleTy) { 4864 default: 4865 return; 4866 case MVT::v2i8: 4867 case MVT::v2i16: 4868 case MVT::v2i32: 4869 case MVT::v2i64: 4870 case MVT::v2f16: 4871 case MVT::v2f32: 4872 case MVT::v2f64: 4873 case MVT::v4i8: 4874 case MVT::v4i16: 4875 case MVT::v4i32: 4876 case MVT::v4f16: 4877 case MVT::v4f32: 4878 case MVT::v8f16: // <4 x f16x2> 4879 // This is a "native" vector type 4880 break; 4881 } 4882 4883 LoadSDNode *LD = cast<LoadSDNode>(N); 4884 4885 Align Alignment = LD->getAlign(); 4886 auto &TD = DAG.getDataLayout(); 4887 Align PrefAlign = TD.getPrefTypeAlign(ResVT.getTypeForEVT(*DAG.getContext())); 4888 if (Alignment < PrefAlign) { 4889 // This load is not sufficiently aligned, so bail out and let this vector 4890 // load be scalarized. Note that we may still be able to emit smaller 4891 // vector loads. For example, if we are loading a <4 x float> with an 4892 // alignment of 8, this check will fail but the legalizer will try again 4893 // with 2 x <2 x float>, which will succeed with an alignment of 8. 4894 return; 4895 } 4896 4897 EVT EltVT = ResVT.getVectorElementType(); 4898 unsigned NumElts = ResVT.getVectorNumElements(); 4899 4900 // Since LoadV2 is a target node, we cannot rely on DAG type legalization. 4901 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 4902 // loaded type to i16 and propagate the "real" type as the memory type. 4903 bool NeedTrunc = false; 4904 if (EltVT.getSizeInBits() < 16) { 4905 EltVT = MVT::i16; 4906 NeedTrunc = true; 4907 } 4908 4909 unsigned Opcode = 0; 4910 SDVTList LdResVTs; 4911 bool LoadF16x2 = false; 4912 4913 switch (NumElts) { 4914 default: 4915 return; 4916 case 2: 4917 Opcode = NVPTXISD::LoadV2; 4918 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 4919 break; 4920 case 4: { 4921 Opcode = NVPTXISD::LoadV4; 4922 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 4923 LdResVTs = DAG.getVTList(ListVTs); 4924 break; 4925 } 4926 case 8: { 4927 // v8f16 is a special case. PTX doesn't have ld.v8.f16 4928 // instruction. Instead, we split the vector into v2f16 chunks and 4929 // load them with ld.v4.b32. 4930 assert(EltVT == MVT::f16 && "Unsupported v8 vector type."); 4931 LoadF16x2 = true; 4932 Opcode = NVPTXISD::LoadV4; 4933 EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16, 4934 MVT::Other}; 4935 LdResVTs = DAG.getVTList(ListVTs); 4936 break; 4937 } 4938 } 4939 4940 // Copy regular operands 4941 SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end()); 4942 4943 // The select routine does not have access to the LoadSDNode instance, so 4944 // pass along the extension information 4945 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); 4946 4947 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 4948 LD->getMemoryVT(), 4949 LD->getMemOperand()); 4950 4951 SmallVector<SDValue, 8> ScalarRes; 4952 if (LoadF16x2) { 4953 // Split v2f16 subvectors back into individual elements. 
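// Illustrative sketch: for a v8f16 load, NewLD carries four v2f16 results
// plus a chain; extracting lanes 0 and 1 of each v2f16 value recovers the
// eight f16 scalars that rebuild the original vector below.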
4954 NumElts /= 2; 4955 for (unsigned i = 0; i < NumElts; ++i) { 4956 SDValue SubVector = NewLD.getValue(i); 4957 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, 4958 DAG.getIntPtrConstant(0, DL)); 4959 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, 4960 DAG.getIntPtrConstant(1, DL)); 4961 ScalarRes.push_back(E0); 4962 ScalarRes.push_back(E1); 4963 } 4964 } else { 4965 for (unsigned i = 0; i < NumElts; ++i) { 4966 SDValue Res = NewLD.getValue(i); 4967 if (NeedTrunc) 4968 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 4969 ScalarRes.push_back(Res); 4970 } 4971 } 4972 4973 SDValue LoadChain = NewLD.getValue(NumElts); 4974 4975 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes); 4976 4977 Results.push_back(BuildVec); 4978 Results.push_back(LoadChain); 4979 } 4980 4981 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, 4982 SmallVectorImpl<SDValue> &Results) { 4983 SDValue Chain = N->getOperand(0); 4984 SDValue Intrin = N->getOperand(1); 4985 SDLoc DL(N); 4986 4987 // Get the intrinsic ID 4988 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); 4989 switch (IntrinNo) { 4990 default: 4991 return; 4992 case Intrinsic::nvvm_ldg_global_i: 4993 case Intrinsic::nvvm_ldg_global_f: 4994 case Intrinsic::nvvm_ldg_global_p: 4995 case Intrinsic::nvvm_ldu_global_i: 4996 case Intrinsic::nvvm_ldu_global_f: 4997 case Intrinsic::nvvm_ldu_global_p: { 4998 EVT ResVT = N->getValueType(0); 4999 5000 if (ResVT.isVector()) { 5001 // Vector LDG/LDU 5002 5003 unsigned NumElts = ResVT.getVectorNumElements(); 5004 EVT EltVT = ResVT.getVectorElementType(); 5005 5006 // Since LDU/LDG are target nodes, we cannot rely on DAG type 5007 // legalization. 5008 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 5009 // loaded type to i16 and propagate the "real" type as the memory type. 
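// Illustrative sketch: a <2 x i8> ldg result is produced here as two i16
// values while v2i8 is kept as the memory type, then truncated back to i8
// and re-assembled with BUILD_VECTOR further below.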
5010 bool NeedTrunc = false; 5011 if (EltVT.getSizeInBits() < 16) { 5012 EltVT = MVT::i16; 5013 NeedTrunc = true; 5014 } 5015 5016 unsigned Opcode = 0; 5017 SDVTList LdResVTs; 5018 5019 switch (NumElts) { 5020 default: 5021 return; 5022 case 2: 5023 switch (IntrinNo) { 5024 default: 5025 return; 5026 case Intrinsic::nvvm_ldg_global_i: 5027 case Intrinsic::nvvm_ldg_global_f: 5028 case Intrinsic::nvvm_ldg_global_p: 5029 Opcode = NVPTXISD::LDGV2; 5030 break; 5031 case Intrinsic::nvvm_ldu_global_i: 5032 case Intrinsic::nvvm_ldu_global_f: 5033 case Intrinsic::nvvm_ldu_global_p: 5034 Opcode = NVPTXISD::LDUV2; 5035 break; 5036 } 5037 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 5038 break; 5039 case 4: { 5040 switch (IntrinNo) { 5041 default: 5042 return; 5043 case Intrinsic::nvvm_ldg_global_i: 5044 case Intrinsic::nvvm_ldg_global_f: 5045 case Intrinsic::nvvm_ldg_global_p: 5046 Opcode = NVPTXISD::LDGV4; 5047 break; 5048 case Intrinsic::nvvm_ldu_global_i: 5049 case Intrinsic::nvvm_ldu_global_f: 5050 case Intrinsic::nvvm_ldu_global_p: 5051 Opcode = NVPTXISD::LDUV4; 5052 break; 5053 } 5054 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 5055 LdResVTs = DAG.getVTList(ListVTs); 5056 break; 5057 } 5058 } 5059 5060 SmallVector<SDValue, 8> OtherOps; 5061 5062 // Copy regular operands 5063 5064 OtherOps.push_back(Chain); // Chain 5065 // Skip operand 1 (intrinsic ID) 5066 // Others 5067 OtherOps.append(N->op_begin() + 2, N->op_end()); 5068 5069 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 5070 5071 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 5072 MemSD->getMemoryVT(), 5073 MemSD->getMemOperand()); 5074 5075 SmallVector<SDValue, 4> ScalarRes; 5076 5077 for (unsigned i = 0; i < NumElts; ++i) { 5078 SDValue Res = NewLD.getValue(i); 5079 if (NeedTrunc) 5080 Res = 5081 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 5082 ScalarRes.push_back(Res); 5083 } 5084 5085 SDValue LoadChain = NewLD.getValue(NumElts); 5086 5087 SDValue BuildVec = 5088 DAG.getBuildVector(ResVT, DL, ScalarRes); 5089 5090 Results.push_back(BuildVec); 5091 Results.push_back(LoadChain); 5092 } else { 5093 // i8 LDG/LDU 5094 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && 5095 "Custom handling of non-i8 ldu/ldg?"); 5096 5097 // Just copy all operands as-is 5098 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); 5099 5100 // Force output to i16 5101 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); 5102 5103 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 5104 5105 // We make sure the memory type is i8, which will be used during isel 5106 // to select the proper instruction. 5107 SDValue NewLD = 5108 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, 5109 MVT::i8, MemSD->getMemOperand()); 5110 5111 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, 5112 NewLD.getValue(0))); 5113 Results.push_back(NewLD.getValue(1)); 5114 } 5115 } 5116 } 5117 } 5118 5119 void NVPTXTargetLowering::ReplaceNodeResults( 5120 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 5121 switch (N->getOpcode()) { 5122 default: 5123 report_fatal_error("Unhandled custom legalization"); 5124 case ISD::LOAD: 5125 ReplaceLoadVector(N, DAG, Results); 5126 return; 5127 case ISD::INTRINSIC_W_CHAIN: 5128 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); 5129 return; 5130 } 5131 } 5132 5133 // Pin NVPTXTargetObjectFile's vtables to this file. 
5134 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {} 5135 5136 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( 5137 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { 5138 return getDataSection(); 5139 } 5140