//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static unsigned int uniqueCallSite = 0;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));

static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
             " 1: do it, 2: do it aggressively)"),
    cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE-compliant F32 div.rn if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

static cl::opt<bool> FtzEnabled(
    "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
    cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
    cl::init(false));

int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command line, always honor it.
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}

bool NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  // TODO: Get rid of this flag; there can be only one way to do this.
  if (FtzEnabled.getNumOccurrences() > 0) {
    // If nvptx-f32ftz is used on the command-line, always honor it
    return FtzEnabled;
  } else {
    const Function &F = MF.getFunction();
    // Otherwise, check for an nvptx-f32ftz attribute on the function
    if (F.hasFnAttribute("nvptx-f32ftz"))
      return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
    else
      return false;
  }
}

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16: // <4 x f16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return
/// the same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64)
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16 elements.
      // We must match this so we stay in sync with Ins/Outs.
      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
        EltVT = MVT::v2f16;
        NumElts /= 2;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");

  // Can't vectorize if param alignment is not sufficient.
  if (AccessSize > ParamAlignment)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
  return NumElts;
}

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     unsigned ParamAlignment) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
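  // For example (illustrative): given four f32 pieces at offsets 0/4/8/12
  // and 16-byte alignment, the loop below produces
  // {PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST}, i.e. one v4 access; with
  // only 8-byte alignment the same pieces become two v2 pairs,
  // {PVF_FIRST, PVF_LAST, PVF_FIRST, PVF_LAST}.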
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  // Check what we can vectorize using 128/64/32/16-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using the largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is expensive. Don't create extra control flow for 'and' and 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use Source scheduling.
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
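  // (These scalar conversions are expected to map to PTX cvt instructions;
  // the v2f16 build/extract operations configured below get custom or
  // expanded handling instead.)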
  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
                 MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant-related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // This is legal in NVPTX
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

  // TRAP can be lowered to PTX trap
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Custom handling for i8 intrinsics
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SREM);
  setTargetDAGCombine(ISD::UREM);

  // setcc for f16x2 needs special handling to prevent the legalizer's
  // attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have a full implementation. Others
  // only have a token amount of hardware and are likely to run faster
  // by using the fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
  }

  // There's no neg.f16 instruction. Expand to (0-x).
  setOperationAction(ISD::FNEG, MVT::f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);

  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }

  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN,
                         ISD::FCOS, ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }
  setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
  setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
  setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
  setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);

  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
  // No FPOW or FREM in PTX.
586 587 // Now deduce the information based on the above mentioned 588 // actions 589 computeRegisterProperties(STI.getRegisterInfo()); 590 } 591 592 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 593 switch ((NVPTXISD::NodeType)Opcode) { 594 case NVPTXISD::FIRST_NUMBER: 595 break; 596 case NVPTXISD::CALL: 597 return "NVPTXISD::CALL"; 598 case NVPTXISD::RET_FLAG: 599 return "NVPTXISD::RET_FLAG"; 600 case NVPTXISD::LOAD_PARAM: 601 return "NVPTXISD::LOAD_PARAM"; 602 case NVPTXISD::Wrapper: 603 return "NVPTXISD::Wrapper"; 604 case NVPTXISD::DeclareParam: 605 return "NVPTXISD::DeclareParam"; 606 case NVPTXISD::DeclareScalarParam: 607 return "NVPTXISD::DeclareScalarParam"; 608 case NVPTXISD::DeclareRet: 609 return "NVPTXISD::DeclareRet"; 610 case NVPTXISD::DeclareScalarRet: 611 return "NVPTXISD::DeclareScalarRet"; 612 case NVPTXISD::DeclareRetParam: 613 return "NVPTXISD::DeclareRetParam"; 614 case NVPTXISD::PrintCall: 615 return "NVPTXISD::PrintCall"; 616 case NVPTXISD::PrintConvergentCall: 617 return "NVPTXISD::PrintConvergentCall"; 618 case NVPTXISD::PrintCallUni: 619 return "NVPTXISD::PrintCallUni"; 620 case NVPTXISD::PrintConvergentCallUni: 621 return "NVPTXISD::PrintConvergentCallUni"; 622 case NVPTXISD::LoadParam: 623 return "NVPTXISD::LoadParam"; 624 case NVPTXISD::LoadParamV2: 625 return "NVPTXISD::LoadParamV2"; 626 case NVPTXISD::LoadParamV4: 627 return "NVPTXISD::LoadParamV4"; 628 case NVPTXISD::StoreParam: 629 return "NVPTXISD::StoreParam"; 630 case NVPTXISD::StoreParamV2: 631 return "NVPTXISD::StoreParamV2"; 632 case NVPTXISD::StoreParamV4: 633 return "NVPTXISD::StoreParamV4"; 634 case NVPTXISD::StoreParamS32: 635 return "NVPTXISD::StoreParamS32"; 636 case NVPTXISD::StoreParamU32: 637 return "NVPTXISD::StoreParamU32"; 638 case NVPTXISD::CallArgBegin: 639 return "NVPTXISD::CallArgBegin"; 640 case NVPTXISD::CallArg: 641 return "NVPTXISD::CallArg"; 642 case NVPTXISD::LastCallArg: 643 return "NVPTXISD::LastCallArg"; 644 case NVPTXISD::CallArgEnd: 645 return "NVPTXISD::CallArgEnd"; 646 case NVPTXISD::CallVoid: 647 return "NVPTXISD::CallVoid"; 648 case NVPTXISD::CallVal: 649 return "NVPTXISD::CallVal"; 650 case NVPTXISD::CallSymbol: 651 return "NVPTXISD::CallSymbol"; 652 case NVPTXISD::Prototype: 653 return "NVPTXISD::Prototype"; 654 case NVPTXISD::MoveParam: 655 return "NVPTXISD::MoveParam"; 656 case NVPTXISD::StoreRetval: 657 return "NVPTXISD::StoreRetval"; 658 case NVPTXISD::StoreRetvalV2: 659 return "NVPTXISD::StoreRetvalV2"; 660 case NVPTXISD::StoreRetvalV4: 661 return "NVPTXISD::StoreRetvalV4"; 662 case NVPTXISD::PseudoUseParam: 663 return "NVPTXISD::PseudoUseParam"; 664 case NVPTXISD::RETURN: 665 return "NVPTXISD::RETURN"; 666 case NVPTXISD::CallSeqBegin: 667 return "NVPTXISD::CallSeqBegin"; 668 case NVPTXISD::CallSeqEnd: 669 return "NVPTXISD::CallSeqEnd"; 670 case NVPTXISD::CallPrototype: 671 return "NVPTXISD::CallPrototype"; 672 case NVPTXISD::ProxyReg: 673 return "NVPTXISD::ProxyReg"; 674 case NVPTXISD::LoadV2: 675 return "NVPTXISD::LoadV2"; 676 case NVPTXISD::LoadV4: 677 return "NVPTXISD::LoadV4"; 678 case NVPTXISD::LDGV2: 679 return "NVPTXISD::LDGV2"; 680 case NVPTXISD::LDGV4: 681 return "NVPTXISD::LDGV4"; 682 case NVPTXISD::LDUV2: 683 return "NVPTXISD::LDUV2"; 684 case NVPTXISD::LDUV4: 685 return "NVPTXISD::LDUV4"; 686 case NVPTXISD::StoreV2: 687 return "NVPTXISD::StoreV2"; 688 case NVPTXISD::StoreV4: 689 return "NVPTXISD::StoreV4"; 690 case NVPTXISD::FUN_SHFL_CLAMP: 691 return "NVPTXISD::FUN_SHFL_CLAMP"; 692 case NVPTXISD::FUN_SHFR_CLAMP: 693 
return "NVPTXISD::FUN_SHFR_CLAMP"; 694 case NVPTXISD::IMAD: 695 return "NVPTXISD::IMAD"; 696 case NVPTXISD::SETP_F16X2: 697 return "NVPTXISD::SETP_F16X2"; 698 case NVPTXISD::Dummy: 699 return "NVPTXISD::Dummy"; 700 case NVPTXISD::MUL_WIDE_SIGNED: 701 return "NVPTXISD::MUL_WIDE_SIGNED"; 702 case NVPTXISD::MUL_WIDE_UNSIGNED: 703 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 704 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; 705 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 706 case NVPTXISD::Tex1DFloatFloatLevel: 707 return "NVPTXISD::Tex1DFloatFloatLevel"; 708 case NVPTXISD::Tex1DFloatFloatGrad: 709 return "NVPTXISD::Tex1DFloatFloatGrad"; 710 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; 711 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; 712 case NVPTXISD::Tex1DS32FloatLevel: 713 return "NVPTXISD::Tex1DS32FloatLevel"; 714 case NVPTXISD::Tex1DS32FloatGrad: 715 return "NVPTXISD::Tex1DS32FloatGrad"; 716 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; 717 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; 718 case NVPTXISD::Tex1DU32FloatLevel: 719 return "NVPTXISD::Tex1DU32FloatLevel"; 720 case NVPTXISD::Tex1DU32FloatGrad: 721 return "NVPTXISD::Tex1DU32FloatGrad"; 722 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; 723 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; 724 case NVPTXISD::Tex1DArrayFloatFloatLevel: 725 return "NVPTXISD::Tex1DArrayFloatFloatLevel"; 726 case NVPTXISD::Tex1DArrayFloatFloatGrad: 727 return "NVPTXISD::Tex1DArrayFloatFloatGrad"; 728 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; 729 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; 730 case NVPTXISD::Tex1DArrayS32FloatLevel: 731 return "NVPTXISD::Tex1DArrayS32FloatLevel"; 732 case NVPTXISD::Tex1DArrayS32FloatGrad: 733 return "NVPTXISD::Tex1DArrayS32FloatGrad"; 734 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; 735 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; 736 case NVPTXISD::Tex1DArrayU32FloatLevel: 737 return "NVPTXISD::Tex1DArrayU32FloatLevel"; 738 case NVPTXISD::Tex1DArrayU32FloatGrad: 739 return "NVPTXISD::Tex1DArrayU32FloatGrad"; 740 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; 741 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 742 case NVPTXISD::Tex2DFloatFloatLevel: 743 return "NVPTXISD::Tex2DFloatFloatLevel"; 744 case NVPTXISD::Tex2DFloatFloatGrad: 745 return "NVPTXISD::Tex2DFloatFloatGrad"; 746 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; 747 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; 748 case NVPTXISD::Tex2DS32FloatLevel: 749 return "NVPTXISD::Tex2DS32FloatLevel"; 750 case NVPTXISD::Tex2DS32FloatGrad: 751 return "NVPTXISD::Tex2DS32FloatGrad"; 752 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; 753 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; 754 case NVPTXISD::Tex2DU32FloatLevel: 755 return "NVPTXISD::Tex2DU32FloatLevel"; 756 case NVPTXISD::Tex2DU32FloatGrad: 757 return "NVPTXISD::Tex2DU32FloatGrad"; 758 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; 759 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 760 case NVPTXISD::Tex2DArrayFloatFloatLevel: 761 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 762 case NVPTXISD::Tex2DArrayFloatFloatGrad: 763 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 764 case 
NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; 765 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; 766 case NVPTXISD::Tex2DArrayS32FloatLevel: 767 return "NVPTXISD::Tex2DArrayS32FloatLevel"; 768 case NVPTXISD::Tex2DArrayS32FloatGrad: 769 return "NVPTXISD::Tex2DArrayS32FloatGrad"; 770 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; 771 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; 772 case NVPTXISD::Tex2DArrayU32FloatLevel: 773 return "NVPTXISD::Tex2DArrayU32FloatLevel"; 774 case NVPTXISD::Tex2DArrayU32FloatGrad: 775 return "NVPTXISD::Tex2DArrayU32FloatGrad"; 776 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; 777 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 778 case NVPTXISD::Tex3DFloatFloatLevel: 779 return "NVPTXISD::Tex3DFloatFloatLevel"; 780 case NVPTXISD::Tex3DFloatFloatGrad: 781 return "NVPTXISD::Tex3DFloatFloatGrad"; 782 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; 783 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; 784 case NVPTXISD::Tex3DS32FloatLevel: 785 return "NVPTXISD::Tex3DS32FloatLevel"; 786 case NVPTXISD::Tex3DS32FloatGrad: 787 return "NVPTXISD::Tex3DS32FloatGrad"; 788 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; 789 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; 790 case NVPTXISD::Tex3DU32FloatLevel: 791 return "NVPTXISD::Tex3DU32FloatLevel"; 792 case NVPTXISD::Tex3DU32FloatGrad: 793 return "NVPTXISD::Tex3DU32FloatGrad"; 794 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; 795 case NVPTXISD::TexCubeFloatFloatLevel: 796 return "NVPTXISD::TexCubeFloatFloatLevel"; 797 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; 798 case NVPTXISD::TexCubeS32FloatLevel: 799 return "NVPTXISD::TexCubeS32FloatLevel"; 800 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; 801 case NVPTXISD::TexCubeU32FloatLevel: 802 return "NVPTXISD::TexCubeU32FloatLevel"; 803 case NVPTXISD::TexCubeArrayFloatFloat: 804 return "NVPTXISD::TexCubeArrayFloatFloat"; 805 case NVPTXISD::TexCubeArrayFloatFloatLevel: 806 return "NVPTXISD::TexCubeArrayFloatFloatLevel"; 807 case NVPTXISD::TexCubeArrayS32Float: 808 return "NVPTXISD::TexCubeArrayS32Float"; 809 case NVPTXISD::TexCubeArrayS32FloatLevel: 810 return "NVPTXISD::TexCubeArrayS32FloatLevel"; 811 case NVPTXISD::TexCubeArrayU32Float: 812 return "NVPTXISD::TexCubeArrayU32Float"; 813 case NVPTXISD::TexCubeArrayU32FloatLevel: 814 return "NVPTXISD::TexCubeArrayU32FloatLevel"; 815 case NVPTXISD::Tld4R2DFloatFloat: 816 return "NVPTXISD::Tld4R2DFloatFloat"; 817 case NVPTXISD::Tld4G2DFloatFloat: 818 return "NVPTXISD::Tld4G2DFloatFloat"; 819 case NVPTXISD::Tld4B2DFloatFloat: 820 return "NVPTXISD::Tld4B2DFloatFloat"; 821 case NVPTXISD::Tld4A2DFloatFloat: 822 return "NVPTXISD::Tld4A2DFloatFloat"; 823 case NVPTXISD::Tld4R2DS64Float: 824 return "NVPTXISD::Tld4R2DS64Float"; 825 case NVPTXISD::Tld4G2DS64Float: 826 return "NVPTXISD::Tld4G2DS64Float"; 827 case NVPTXISD::Tld4B2DS64Float: 828 return "NVPTXISD::Tld4B2DS64Float"; 829 case NVPTXISD::Tld4A2DS64Float: 830 return "NVPTXISD::Tld4A2DS64Float"; 831 case NVPTXISD::Tld4R2DU64Float: 832 return "NVPTXISD::Tld4R2DU64Float"; 833 case NVPTXISD::Tld4G2DU64Float: 834 return "NVPTXISD::Tld4G2DU64Float"; 835 case NVPTXISD::Tld4B2DU64Float: 836 return "NVPTXISD::Tld4B2DU64Float"; 837 case NVPTXISD::Tld4A2DU64Float: 838 return "NVPTXISD::Tld4A2DU64Float"; 839 840 case 
NVPTXISD::TexUnified1DFloatS32: 841 return "NVPTXISD::TexUnified1DFloatS32"; 842 case NVPTXISD::TexUnified1DFloatFloat: 843 return "NVPTXISD::TexUnified1DFloatFloat"; 844 case NVPTXISD::TexUnified1DFloatFloatLevel: 845 return "NVPTXISD::TexUnified1DFloatFloatLevel"; 846 case NVPTXISD::TexUnified1DFloatFloatGrad: 847 return "NVPTXISD::TexUnified1DFloatFloatGrad"; 848 case NVPTXISD::TexUnified1DS32S32: 849 return "NVPTXISD::TexUnified1DS32S32"; 850 case NVPTXISD::TexUnified1DS32Float: 851 return "NVPTXISD::TexUnified1DS32Float"; 852 case NVPTXISD::TexUnified1DS32FloatLevel: 853 return "NVPTXISD::TexUnified1DS32FloatLevel"; 854 case NVPTXISD::TexUnified1DS32FloatGrad: 855 return "NVPTXISD::TexUnified1DS32FloatGrad"; 856 case NVPTXISD::TexUnified1DU32S32: 857 return "NVPTXISD::TexUnified1DU32S32"; 858 case NVPTXISD::TexUnified1DU32Float: 859 return "NVPTXISD::TexUnified1DU32Float"; 860 case NVPTXISD::TexUnified1DU32FloatLevel: 861 return "NVPTXISD::TexUnified1DU32FloatLevel"; 862 case NVPTXISD::TexUnified1DU32FloatGrad: 863 return "NVPTXISD::TexUnified1DU32FloatGrad"; 864 case NVPTXISD::TexUnified1DArrayFloatS32: 865 return "NVPTXISD::TexUnified1DArrayFloatS32"; 866 case NVPTXISD::TexUnified1DArrayFloatFloat: 867 return "NVPTXISD::TexUnified1DArrayFloatFloat"; 868 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 869 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; 870 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 871 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; 872 case NVPTXISD::TexUnified1DArrayS32S32: 873 return "NVPTXISD::TexUnified1DArrayS32S32"; 874 case NVPTXISD::TexUnified1DArrayS32Float: 875 return "NVPTXISD::TexUnified1DArrayS32Float"; 876 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 877 return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; 878 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 879 return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; 880 case NVPTXISD::TexUnified1DArrayU32S32: 881 return "NVPTXISD::TexUnified1DArrayU32S32"; 882 case NVPTXISD::TexUnified1DArrayU32Float: 883 return "NVPTXISD::TexUnified1DArrayU32Float"; 884 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 885 return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; 886 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 887 return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; 888 case NVPTXISD::TexUnified2DFloatS32: 889 return "NVPTXISD::TexUnified2DFloatS32"; 890 case NVPTXISD::TexUnified2DFloatFloat: 891 return "NVPTXISD::TexUnified2DFloatFloat"; 892 case NVPTXISD::TexUnified2DFloatFloatLevel: 893 return "NVPTXISD::TexUnified2DFloatFloatLevel"; 894 case NVPTXISD::TexUnified2DFloatFloatGrad: 895 return "NVPTXISD::TexUnified2DFloatFloatGrad"; 896 case NVPTXISD::TexUnified2DS32S32: 897 return "NVPTXISD::TexUnified2DS32S32"; 898 case NVPTXISD::TexUnified2DS32Float: 899 return "NVPTXISD::TexUnified2DS32Float"; 900 case NVPTXISD::TexUnified2DS32FloatLevel: 901 return "NVPTXISD::TexUnified2DS32FloatLevel"; 902 case NVPTXISD::TexUnified2DS32FloatGrad: 903 return "NVPTXISD::TexUnified2DS32FloatGrad"; 904 case NVPTXISD::TexUnified2DU32S32: 905 return "NVPTXISD::TexUnified2DU32S32"; 906 case NVPTXISD::TexUnified2DU32Float: 907 return "NVPTXISD::TexUnified2DU32Float"; 908 case NVPTXISD::TexUnified2DU32FloatLevel: 909 return "NVPTXISD::TexUnified2DU32FloatLevel"; 910 case NVPTXISD::TexUnified2DU32FloatGrad: 911 return "NVPTXISD::TexUnified2DU32FloatGrad"; 912 case NVPTXISD::TexUnified2DArrayFloatS32: 913 return "NVPTXISD::TexUnified2DArrayFloatS32"; 914 case NVPTXISD::TexUnified2DArrayFloatFloat: 915 return 
"NVPTXISD::TexUnified2DArrayFloatFloat"; 916 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 917 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; 918 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 919 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; 920 case NVPTXISD::TexUnified2DArrayS32S32: 921 return "NVPTXISD::TexUnified2DArrayS32S32"; 922 case NVPTXISD::TexUnified2DArrayS32Float: 923 return "NVPTXISD::TexUnified2DArrayS32Float"; 924 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 925 return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; 926 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 927 return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; 928 case NVPTXISD::TexUnified2DArrayU32S32: 929 return "NVPTXISD::TexUnified2DArrayU32S32"; 930 case NVPTXISD::TexUnified2DArrayU32Float: 931 return "NVPTXISD::TexUnified2DArrayU32Float"; 932 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 933 return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; 934 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 935 return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; 936 case NVPTXISD::TexUnified3DFloatS32: 937 return "NVPTXISD::TexUnified3DFloatS32"; 938 case NVPTXISD::TexUnified3DFloatFloat: 939 return "NVPTXISD::TexUnified3DFloatFloat"; 940 case NVPTXISD::TexUnified3DFloatFloatLevel: 941 return "NVPTXISD::TexUnified3DFloatFloatLevel"; 942 case NVPTXISD::TexUnified3DFloatFloatGrad: 943 return "NVPTXISD::TexUnified3DFloatFloatGrad"; 944 case NVPTXISD::TexUnified3DS32S32: 945 return "NVPTXISD::TexUnified3DS32S32"; 946 case NVPTXISD::TexUnified3DS32Float: 947 return "NVPTXISD::TexUnified3DS32Float"; 948 case NVPTXISD::TexUnified3DS32FloatLevel: 949 return "NVPTXISD::TexUnified3DS32FloatLevel"; 950 case NVPTXISD::TexUnified3DS32FloatGrad: 951 return "NVPTXISD::TexUnified3DS32FloatGrad"; 952 case NVPTXISD::TexUnified3DU32S32: 953 return "NVPTXISD::TexUnified3DU32S32"; 954 case NVPTXISD::TexUnified3DU32Float: 955 return "NVPTXISD::TexUnified3DU32Float"; 956 case NVPTXISD::TexUnified3DU32FloatLevel: 957 return "NVPTXISD::TexUnified3DU32FloatLevel"; 958 case NVPTXISD::TexUnified3DU32FloatGrad: 959 return "NVPTXISD::TexUnified3DU32FloatGrad"; 960 case NVPTXISD::TexUnifiedCubeFloatFloat: 961 return "NVPTXISD::TexUnifiedCubeFloatFloat"; 962 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 963 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; 964 case NVPTXISD::TexUnifiedCubeS32Float: 965 return "NVPTXISD::TexUnifiedCubeS32Float"; 966 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 967 return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; 968 case NVPTXISD::TexUnifiedCubeU32Float: 969 return "NVPTXISD::TexUnifiedCubeU32Float"; 970 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 971 return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; 972 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 973 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; 974 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 975 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; 976 case NVPTXISD::TexUnifiedCubeArrayS32Float: 977 return "NVPTXISD::TexUnifiedCubeArrayS32Float"; 978 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 979 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; 980 case NVPTXISD::TexUnifiedCubeArrayU32Float: 981 return "NVPTXISD::TexUnifiedCubeArrayU32Float"; 982 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 983 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; 984 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 985 return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; 986 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 987 return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; 988 
case NVPTXISD::Tld4UnifiedB2DFloatFloat: 989 return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; 990 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 991 return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; 992 case NVPTXISD::Tld4UnifiedR2DS64Float: 993 return "NVPTXISD::Tld4UnifiedR2DS64Float"; 994 case NVPTXISD::Tld4UnifiedG2DS64Float: 995 return "NVPTXISD::Tld4UnifiedG2DS64Float"; 996 case NVPTXISD::Tld4UnifiedB2DS64Float: 997 return "NVPTXISD::Tld4UnifiedB2DS64Float"; 998 case NVPTXISD::Tld4UnifiedA2DS64Float: 999 return "NVPTXISD::Tld4UnifiedA2DS64Float"; 1000 case NVPTXISD::Tld4UnifiedR2DU64Float: 1001 return "NVPTXISD::Tld4UnifiedR2DU64Float"; 1002 case NVPTXISD::Tld4UnifiedG2DU64Float: 1003 return "NVPTXISD::Tld4UnifiedG2DU64Float"; 1004 case NVPTXISD::Tld4UnifiedB2DU64Float: 1005 return "NVPTXISD::Tld4UnifiedB2DU64Float"; 1006 case NVPTXISD::Tld4UnifiedA2DU64Float: 1007 return "NVPTXISD::Tld4UnifiedA2DU64Float"; 1008 1009 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; 1010 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; 1011 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; 1012 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; 1013 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; 1014 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; 1015 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; 1016 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; 1017 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; 1018 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; 1019 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; 1020 1021 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; 1022 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; 1023 case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; 1024 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; 1025 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; 1026 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; 1027 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; 1028 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; 1029 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; 1030 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; 1031 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; 1032 1033 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; 1034 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; 1035 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; 1036 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; 1037 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; 1038 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; 1039 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; 1040 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; 1041 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; 1042 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; 1043 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; 1044 1045 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; 1046 case 
NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; 1047 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; 1048 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; 1049 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; 1050 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; 1051 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; 1052 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; 1053 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; 1054 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; 1055 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; 1056 1057 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; 1058 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; 1059 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; 1060 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; 1061 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; 1062 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; 1063 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; 1064 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; 1065 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; 1066 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; 1067 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; 1068 1069 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 1070 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 1071 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 1072 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; 1073 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 1074 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 1075 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 1076 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; 1077 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 1078 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 1079 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 1080 1081 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 1082 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 1083 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 1084 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; 1085 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 1086 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 1087 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 1088 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; 1089 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 1090 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 1091 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 1092 1093 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 1094 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; 1095 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 1096 case NVPTXISD::Suld2DI64Trap: 
return "NVPTXISD::Suld2DI64Trap"; 1097 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 1098 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; 1099 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 1100 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; 1101 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 1102 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 1103 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 1104 1105 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 1106 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 1107 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 1108 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; 1109 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 1110 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 1111 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 1112 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; 1113 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 1114 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 1115 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 1116 1117 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 1118 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 1119 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 1120 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; 1121 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 1122 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 1123 case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; 1124 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; 1125 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 1126 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 1127 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 1128 1129 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; 1130 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; 1131 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; 1132 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; 1133 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; 1134 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; 1135 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; 1136 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; 1137 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; 1138 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; 1139 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; 1140 1141 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; 1142 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; 1143 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; 1144 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; 1145 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; 1146 case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; 1147 case NVPTXISD::Suld1DArrayV2I32Zero: return 
"NVPTXISD::Suld1DArrayV2I32Zero"; 1148 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; 1149 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; 1150 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; 1151 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; 1152 1153 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; 1154 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; 1155 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; 1156 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; 1157 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; 1158 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; 1159 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; 1160 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; 1161 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; 1162 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; 1163 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; 1164 1165 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; 1166 case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; 1167 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; 1168 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; 1169 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; 1170 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; 1171 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; 1172 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; 1173 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; 1174 case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; 1175 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; 1176 1177 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; 1178 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; 1179 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; 1180 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; 1181 case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; 1182 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; 1183 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; 1184 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; 1185 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; 1186 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; 1187 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; 1188 } 1189 return nullptr; 1190 } 1191 1192 TargetLoweringBase::LegalizeTypeAction 1193 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1194 if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) 1195 return TypeSplitVector; 1196 if (VT == MVT::v2f16) 1197 return TypeLegal; 1198 return TargetLoweringBase::getPreferredVectorAction(VT); 1199 } 1200 1201 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1202 int Enabled, int &ExtraSteps, 1203 bool &UseOneConst, 1204 bool Reciprocal) const { 1205 if (!(Enabled == ReciprocalEstimate::Enabled || 1206 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1207 return SDValue(); 1208 1209 if 
(ExtraSteps == ReciprocalEstimate::Unspecified) 1210 ExtraSteps = 0; 1211 1212 SDLoc DL(Operand); 1213 EVT VT = Operand.getValueType(); 1214 bool Ftz = useF32FTZ(DAG.getMachineFunction()); 1215 1216 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { 1217 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 1218 DAG.getConstant(IID, DL, MVT::i32), Operand); 1219 }; 1220 1221 // The sqrt and rsqrt refinement processes assume we always start out with an 1222 // approximation of the rsqrt. Therefore, if we're going to do any refinement 1223 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing 1224 // any refinement, we must return a regular sqrt. 1225 if (Reciprocal || ExtraSteps > 0) { 1226 if (VT == MVT::f32) 1227 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f 1228 : Intrinsic::nvvm_rsqrt_approx_f); 1229 else if (VT == MVT::f64) 1230 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); 1231 else 1232 return SDValue(); 1233 } else { 1234 if (VT == MVT::f32) 1235 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f 1236 : Intrinsic::nvvm_sqrt_approx_f); 1237 else { 1238 // There's no sqrt.approx.f64 instruction, so we emit 1239 // reciprocal(rsqrt(x)). This is faster than 1240 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain 1241 // x * rsqrt(x).) 1242 return DAG.getNode( 1243 ISD::INTRINSIC_WO_CHAIN, DL, VT, 1244 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), 1245 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); 1246 } 1247 } 1248 } 1249 1250 SDValue 1251 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 1252 SDLoc dl(Op); 1253 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); 1254 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); 1255 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); 1256 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); 1257 } 1258 1259 std::string NVPTXTargetLowering::getPrototype( 1260 const DataLayout &DL, Type *retTy, const ArgListTy &Args, 1261 const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment, 1262 ImmutableCallSite CS) const { 1263 auto PtrVT = getPointerTy(DL); 1264 1265 bool isABI = (STI.getSmVersion() >= 20); 1266 assert(isABI && "Non-ABI compilation is not supported"); 1267 if (!isABI) 1268 return ""; 1269 1270 std::stringstream O; 1271 O << "prototype_" << uniqueCallSite << " : .callprototype "; 1272 1273 if (retTy->getTypeID() == Type::VoidTyID) { 1274 O << "()"; 1275 } else { 1276 O << "("; 1277 if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) { 1278 unsigned size = 0; 1279 if (auto *ITy = dyn_cast<IntegerType>(retTy)) { 1280 size = ITy->getBitWidth(); 1281 } else { 1282 assert(retTy->isFloatingPointTy() && 1283 "Floating point type expected here"); 1284 size = retTy->getPrimitiveSizeInBits(); 1285 } 1286 // PTX ABI requires all scalar return values to be at least 32 1287 // bits in size. fp16 normally uses .b16 as its storage type in 1288 // PTX, so its size must be adjusted here, too. 
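      // For illustration (values are hypothetical): a callee returning float
      // and taking two i32 arguments would yield something like
      //   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b32 _);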
1289 if (size < 32) 1290 size = 32; 1291 1292 O << ".param .b" << size << " _"; 1293 } else if (isa<PointerType>(retTy)) { 1294 O << ".param .b" << PtrVT.getSizeInBits() << " _"; 1295 } else if (retTy->isAggregateType() || retTy->isVectorTy() || 1296 retTy->isIntegerTy(128)) { 1297 O << ".param .align " << retAlignment << " .b8 _[" 1298 << DL.getTypeAllocSize(retTy) << "]"; 1299 } else { 1300 llvm_unreachable("Unknown return type"); 1301 } 1302 O << ") "; 1303 } 1304 O << "_ ("; 1305 1306 bool first = true; 1307 1308 unsigned OIdx = 0; 1309 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1310 Type *Ty = Args[i].Ty; 1311 if (!first) { 1312 O << ", "; 1313 } 1314 first = false; 1315 1316 if (!Outs[OIdx].Flags.isByVal()) { 1317 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { 1318 unsigned align = 0; 1319 const CallInst *CallI = cast<CallInst>(CS.getInstruction()); 1320 // +1 because index 0 is reserved for return type alignment 1321 if (!getAlign(*CallI, i + 1, align)) 1322 align = DL.getABITypeAlignment(Ty); 1323 unsigned sz = DL.getTypeAllocSize(Ty); 1324 O << ".param .align " << align << " .b8 "; 1325 O << "_"; 1326 O << "[" << sz << "]"; 1327 // update the index for Outs 1328 SmallVector<EVT, 16> vtparts; 1329 ComputeValueVTs(*this, DL, Ty, vtparts); 1330 if (unsigned len = vtparts.size()) 1331 OIdx += len - 1; 1332 continue; 1333 } 1334 // i8 types in IR will be i16 types in SDAG 1335 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1336 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1337 "type mismatch between callee prototype and arguments"); 1338 // scalar type 1339 unsigned sz = 0; 1340 if (isa<IntegerType>(Ty)) { 1341 sz = cast<IntegerType>(Ty)->getBitWidth(); 1342 if (sz < 32) 1343 sz = 32; 1344 } else if (isa<PointerType>(Ty)) { 1345 sz = PtrVT.getSizeInBits(); 1346 } else if (Ty->isHalfTy()) 1347 // PTX ABI requires all scalar parameters to be at least 32 1348 // bits in size. fp16 normally uses .b16 as its storage type 1349 // in PTX, so its size must be adjusted here, too. 1350 sz = 32; 1351 else 1352 sz = Ty->getPrimitiveSizeInBits(); 1353 O << ".param .b" << sz << " "; 1354 O << "_"; 1355 continue; 1356 } 1357 auto *PTy = dyn_cast<PointerType>(Ty); 1358 assert(PTy && "Param with byval attribute should be a pointer type"); 1359 Type *ETy = PTy->getElementType(); 1360 1361 unsigned align = Outs[OIdx].Flags.getByValAlign(); 1362 unsigned sz = DL.getTypeAllocSize(ETy); 1363 O << ".param .align " << align << " .b8 "; 1364 O << "_"; 1365 O << "[" << sz << "]"; 1366 } 1367 O << ");"; 1368 return O.str(); 1369 } 1370 1371 unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, 1372 ImmutableCallSite CS, 1373 Type *Ty, unsigned Idx, 1374 const DataLayout &DL) const { 1375 if (!CS) { 1376 // CallSite is zero, fallback to ABI type alignment 1377 return DL.getABITypeAlignment(Ty); 1378 } 1379 1380 unsigned Align = 0; 1381 const Value *DirectCallee = CS.getCalledFunction(); 1382 1383 if (!DirectCallee) { 1384 // We don't have a direct function symbol, but that may be because of 1385 // constant cast instructions in the call. 
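  // For illustration only (hypothetical IR): a call such as
  //   %r = call i32 bitcast (i32 (i8*)* @callee to i32 (i32*)*)(i32* %p)
  // hides @callee behind a ConstantExpr cast; the cast-stripping loop below is
  // what recovers the Function so its alignment metadata can be queried.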
1386 const Instruction *CalleeI = CS.getInstruction(); 1387 assert(CalleeI && "Call target is not a function or derived value?"); 1388 1389 // With bitcast'd call targets, the instruction will be the call 1390 if (isa<CallInst>(CalleeI)) { 1391 // Check if we have call alignment metadata 1392 if (getAlign(*cast<CallInst>(CalleeI), Idx, Align)) 1393 return Align; 1394 1395 const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue(); 1396 // Ignore any bitcast instructions 1397 while (isa<ConstantExpr>(CalleeV)) { 1398 const ConstantExpr *CE = cast<ConstantExpr>(CalleeV); 1399 if (!CE->isCast()) 1400 break; 1401 // Look through the bitcast 1402 CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0); 1403 } 1404 1405 // We have now looked past all of the bitcasts. Do we finally have a 1406 // Function? 1407 if (isa<Function>(CalleeV)) 1408 DirectCallee = CalleeV; 1409 } 1410 } 1411 1412 // Check for function alignment information if we found that the 1413 // ultimate target is a Function 1414 if (DirectCallee) 1415 if (getAlign(*cast<Function>(DirectCallee), Idx, Align)) 1416 return Align; 1417 1418 // Call is indirect or alignment information is not available, fall back to 1419 // the ABI type alignment 1420 return DL.getABITypeAlignment(Ty); 1421 } 1422 1423 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1424 SmallVectorImpl<SDValue> &InVals) const { 1425 SelectionDAG &DAG = CLI.DAG; 1426 SDLoc dl = CLI.DL; 1427 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1428 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1429 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1430 SDValue Chain = CLI.Chain; 1431 SDValue Callee = CLI.Callee; 1432 bool &isTailCall = CLI.IsTailCall; 1433 ArgListTy &Args = CLI.getArgs(); 1434 Type *RetTy = CLI.RetTy; 1435 ImmutableCallSite CS = CLI.CS; 1436 const DataLayout &DL = DAG.getDataLayout(); 1437 1438 bool isABI = (STI.getSmVersion() >= 20); 1439 assert(isABI && "Non-ABI compilation is not supported"); 1440 if (!isABI) 1441 return Chain; 1442 1443 SDValue tempChain = Chain; 1444 Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl); 1445 SDValue InFlag = Chain.getValue(1); 1446 1447 unsigned paramCount = 0; 1448 // Args.size() and Outs.size() need not match. 1449 // Outs.size() will be larger 1450 // * if there is an aggregate argument with multiple fields (each field 1451 // showing up separately in Outs) 1452 // * if there is a vector argument with more than typical vector-length 1453 // elements (generally if more than 4) where each vector element is 1454 // individually present in Outs. 1455 // So a different index should be used for indexing into Outs/OutVals. 1456 // See similar issue in LowerFormalArguments. 1457 unsigned OIdx = 0; 1458 // Declare the .params or .reg need to pass values 1459 // to the function 1460 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1461 EVT VT = Outs[OIdx].VT; 1462 Type *Ty = Args[i].Ty; 1463 1464 if (!Outs[OIdx].Flags.isByVal()) { 1465 SmallVector<EVT, 16> VTs; 1466 SmallVector<uint64_t, 16> Offsets; 1467 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); 1468 unsigned ArgAlign = 1469 getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); 1470 unsigned AllocSize = DL.getTypeAllocSize(Ty); 1471 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1472 bool NeedAlign; // Does argument declaration specify alignment? 
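      // For illustration (parameter names are hypothetical): an aggregate or
      // vector argument gets an alignment-carrying declaration such as
      //   .param .align 8 .b8 param0[24];
      // while a scalar argument uses the sized form
      //   .param .b32 param1;
      // NeedAlign records which of the two forms was emitted so the per-element
      // store alignment can be derived further down.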
1473 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { 1474 // declare .param .align <align> .b8 .param<n>[<size>]; 1475 SDValue DeclareParamOps[] = { 1476 Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), 1477 DAG.getConstant(paramCount, dl, MVT::i32), 1478 DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; 1479 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1480 DeclareParamOps); 1481 NeedAlign = true; 1482 } else { 1483 // declare .param .b<size> .param<n>; 1484 if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { 1485 // PTX ABI requires integral types to be at least 32 bits in 1486 // size. FP16 is loaded/stored using i16, so it's handled 1487 // here as well. 1488 AllocSize = 4; 1489 } 1490 SDValue DeclareScalarParamOps[] = { 1491 Chain, DAG.getConstant(paramCount, dl, MVT::i32), 1492 DAG.getConstant(AllocSize * 8, dl, MVT::i32), 1493 DAG.getConstant(0, dl, MVT::i32), InFlag}; 1494 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1495 DeclareScalarParamOps); 1496 NeedAlign = false; 1497 } 1498 InFlag = Chain.getValue(1); 1499 1500 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1501 // than 32-bits are sign extended or zero extended, depending on 1502 // whether they are signed or unsigned types. This case applies 1503 // only to scalar parameters and not to aggregate values. 1504 bool ExtendIntegerParam = 1505 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1506 1507 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); 1508 SmallVector<SDValue, 6> StoreOperands; 1509 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1510 // New store. 1511 if (VectorInfo[j] & PVF_FIRST) { 1512 assert(StoreOperands.empty() && "Unfinished preceding store."); 1513 StoreOperands.push_back(Chain); 1514 StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); 1515 StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); 1516 } 1517 1518 EVT EltVT = VTs[j]; 1519 SDValue StVal = OutVals[OIdx]; 1520 if (ExtendIntegerParam) { 1521 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1522 // zext/sext to i32 1523 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1524 : ISD::ZERO_EXTEND, 1525 dl, MVT::i32, StVal); 1526 } else if (EltVT.getSizeInBits() < 16) { 1527 // Use 16-bit registers for small stores as it's the 1528 // smallest general purpose register size supported by NVPTX. 1529 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1530 } 1531 1532 // Record the value to store. 1533 StoreOperands.push_back(StVal); 1534 1535 if (VectorInfo[j] & PVF_LAST) { 1536 unsigned NumElts = StoreOperands.size() - 3; 1537 NVPTXISD::NodeType Op; 1538 switch (NumElts) { 1539 case 1: 1540 Op = NVPTXISD::StoreParam; 1541 break; 1542 case 2: 1543 Op = NVPTXISD::StoreParamV2; 1544 break; 1545 case 4: 1546 Op = NVPTXISD::StoreParamV4; 1547 break; 1548 default: 1549 llvm_unreachable("Invalid vector info."); 1550 } 1551 1552 StoreOperands.push_back(InFlag); 1553 1554 // Adjust type of the store op if we've extended the scalar 1555 // return value. 1556 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; 1557 unsigned EltAlign = 1558 NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0; 1559 1560 Chain = DAG.getMemIntrinsicNode( 1561 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1562 TheStoreType, MachinePointerInfo(), EltAlign, 1563 MachineMemOperand::MOStore); 1564 InFlag = Chain.getValue(1); 1565 1566 // Cleanup. 
1567           StoreOperands.clear();
1568         }
1569         ++OIdx;
1570       }
1571       assert(StoreOperands.empty() && "Unfinished parameter store.");
1572       if (VTs.size() > 0)
1573         --OIdx;
1574       ++paramCount;
1575       continue;
1576     }
1577 
1578     // ByVal arguments
1579     SmallVector<EVT, 16> VTs;
1580     SmallVector<uint64_t, 16> Offsets;
1581     auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1582     assert(PTy && "Type of a byval parameter should be pointer");
1583     ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
1584 
1585     // declare .param .align <align> .b8 .param<n>[<size>];
1586     unsigned sz = Outs[OIdx].Flags.getByValSize();
1587     SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1588     unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
1589     // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1590     // so we don't need to worry about natural alignment or not.
1591     // See TargetLowering::LowerCallTo().
1592 
1593     // Enforce a minimum alignment of 4 to work around a ptxas miscompile
1594     // for sm_50+. See corresponding alignment adjustment in
1595     // emitFunctionParamList() for details.
1596     if (ArgAlign < 4)
1597       ArgAlign = 4;
1598     SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1599                                  DAG.getConstant(paramCount, dl, MVT::i32),
1600                                  DAG.getConstant(sz, dl, MVT::i32), InFlag};
1601     Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1602                         DeclareParamOps);
1603     InFlag = Chain.getValue(1);
1604     for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1605       EVT elemtype = VTs[j];
1606       int curOffset = Offsets[j];
1607       unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
1608       auto PtrVT = getPointerTy(DL);
1609       SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1610                                     DAG.getConstant(curOffset, dl, PtrVT));
1611       SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1612                                    MachinePointerInfo(), PartAlign);
1613       if (elemtype.getSizeInBits() < 16) {
1614         theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1615       }
1616       SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1617       SDValue CopyParamOps[] = { Chain,
1618                                  DAG.getConstant(paramCount, dl, MVT::i32),
1619                                  DAG.getConstant(curOffset, dl, MVT::i32),
1620                                  theVal, InFlag };
1621       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
1622                                       CopyParamOps, elemtype,
1623                                       MachinePointerInfo(), /* Align */ 0,
1624                                       MachineMemOperand::MOStore);
1625 
1626       InFlag = Chain.getValue(1);
1627     }
1628     ++paramCount;
1629   }
1630 
1631   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1632   unsigned retAlignment = 0;
1633 
1634   // Handle Result
1635   if (Ins.size() > 0) {
1636     SmallVector<EVT, 16> resvtparts;
1637     ComputeValueVTs(*this, DL, RetTy, resvtparts);
1638 
1639     // Declare
1640     //   .param .align 16 .b8 retval0[<size-in-bytes>], or
1641     //   .param .b<size-in-bits> retval0
1642     unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1643     // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1644     // these three types to match the logic in
1645     // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1646     // Plus, this behavior is consistent with nvcc's.
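    // For example (illustrative): an i16 or float return is declared as
    //   .param .b32 retval0;
    // after the widening below, while a struct or i128 return falls through to
    // the byte-array form
    //   .param .align <align> .b8 retval0[<size>];
    // handled in the else branch.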
1647     if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1648         (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1649       // Scalar needs to be at least 32 bits wide
1650       if (resultsz < 32)
1651         resultsz = 32;
1652       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1653       SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1654                                   DAG.getConstant(resultsz, dl, MVT::i32),
1655                                   DAG.getConstant(0, dl, MVT::i32), InFlag };
1656       Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1657                           DeclareRetOps);
1658       InFlag = Chain.getValue(1);
1659     } else {
1660       retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1661       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1662       SDValue DeclareRetOps[] = { Chain,
1663                                   DAG.getConstant(retAlignment, dl, MVT::i32),
1664                                   DAG.getConstant(resultsz / 8, dl, MVT::i32),
1665                                   DAG.getConstant(0, dl, MVT::i32), InFlag };
1666       Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1667                           DeclareRetOps);
1668       InFlag = Chain.getValue(1);
1669     }
1670   }
1671 
1672   // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1673   // between them we must rely on the call site value which is valid for
1674   // indirect calls but is always null for libcalls.
1675   bool isIndirectCall = !Func && CS;
1676 
1677   if (isa<ExternalSymbolSDNode>(Callee)) {
1678     Function* CalleeFunc = nullptr;
1679 
1680     // Try to find the callee in the current module.
1681     Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1682     assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1683 
1684     // Set the "libcall callee" attribute to indicate that the function
1685     // must always have a declaration.
1686     CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1687   }
1688 
1689   if (isIndirectCall) {
1690     // This is the indirect function call case: PTX requires a prototype of the
1691     // form
1692     // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1693     // to be emitted, and the label has to be used as the last arg of the call
1694     // instruction.
1695     // The prototype is embedded in a string and put as the operand for a
1696     // CallPrototype SDNode which will print out to the value of the string.
1697     SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1698     std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
1699     const char *ProtoStr =
1700         nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1701     SDValue ProtoOps[] = {
1702         Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1703     };
1704     Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1705     InFlag = Chain.getValue(1);
1706   }
1707   // Op to just print "call"
1708   SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1709   SDValue PrintCallOps[] = {
1710     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1711   };
1712   // We model convergent calls as separate opcodes.
1713   unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1714   if (CLI.IsConvergent)
1715     Opcode = Opcode == NVPTXISD::PrintCallUni ?
NVPTXISD::PrintConvergentCallUni 1716 : NVPTXISD::PrintConvergentCall; 1717 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 1718 InFlag = Chain.getValue(1); 1719 1720 // Ops to print out the function name 1721 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1722 SDValue CallVoidOps[] = { Chain, Callee, InFlag }; 1723 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 1724 InFlag = Chain.getValue(1); 1725 1726 // Ops to print out the param list 1727 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1728 SDValue CallArgBeginOps[] = { Chain, InFlag }; 1729 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 1730 CallArgBeginOps); 1731 InFlag = Chain.getValue(1); 1732 1733 for (unsigned i = 0, e = paramCount; i != e; ++i) { 1734 unsigned opcode; 1735 if (i == (e - 1)) 1736 opcode = NVPTXISD::LastCallArg; 1737 else 1738 opcode = NVPTXISD::CallArg; 1739 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1740 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1741 DAG.getConstant(i, dl, MVT::i32), InFlag }; 1742 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 1743 InFlag = Chain.getValue(1); 1744 } 1745 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1746 SDValue CallArgEndOps[] = { Chain, 1747 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32), 1748 InFlag }; 1749 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 1750 InFlag = Chain.getValue(1); 1751 1752 if (isIndirectCall) { 1753 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1754 SDValue PrototypeOps[] = { Chain, 1755 DAG.getConstant(uniqueCallSite, dl, MVT::i32), 1756 InFlag }; 1757 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 1758 InFlag = Chain.getValue(1); 1759 } 1760 1761 SmallVector<SDValue, 16> ProxyRegOps; 1762 SmallVector<Optional<MVT>, 16> ProxyRegTruncates; 1763 1764 // Generate loads from param memory/moves from registers for result 1765 if (Ins.size() > 0) { 1766 SmallVector<EVT, 16> VTs; 1767 SmallVector<uint64_t, 16> Offsets; 1768 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 1769 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 1770 1771 unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL); 1772 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 1773 1774 SmallVector<EVT, 6> LoadVTs; 1775 int VecIdx = -1; // Index of the first element of the vector. 1776 1777 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 1778 // 32-bits are sign extended or zero extended, depending on whether 1779 // they are signed or unsigned types. 1780 bool ExtendIntegerRetVal = 1781 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 1782 1783 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 1784 bool needTruncate = false; 1785 EVT TheLoadType = VTs[i]; 1786 EVT EltType = Ins[i].VT; 1787 unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]); 1788 if (ExtendIntegerRetVal) { 1789 TheLoadType = MVT::i32; 1790 EltType = MVT::i32; 1791 needTruncate = true; 1792 } else if (TheLoadType.getSizeInBits() < 16) { 1793 if (VTs[i].isInteger()) 1794 needTruncate = true; 1795 EltType = MVT::i16; 1796 } 1797 1798 // Record index of the very first element of the vector. 
1799 if (VectorInfo[i] & PVF_FIRST) { 1800 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 1801 VecIdx = i; 1802 } 1803 1804 LoadVTs.push_back(EltType); 1805 1806 if (VectorInfo[i] & PVF_LAST) { 1807 unsigned NumElts = LoadVTs.size(); 1808 LoadVTs.push_back(MVT::Other); 1809 LoadVTs.push_back(MVT::Glue); 1810 NVPTXISD::NodeType Op; 1811 switch (NumElts) { 1812 case 1: 1813 Op = NVPTXISD::LoadParam; 1814 break; 1815 case 2: 1816 Op = NVPTXISD::LoadParamV2; 1817 break; 1818 case 4: 1819 Op = NVPTXISD::LoadParamV4; 1820 break; 1821 default: 1822 llvm_unreachable("Invalid vector info."); 1823 } 1824 1825 SDValue LoadOperands[] = { 1826 Chain, DAG.getConstant(1, dl, MVT::i32), 1827 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; 1828 SDValue RetVal = DAG.getMemIntrinsicNode( 1829 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 1830 MachinePointerInfo(), EltAlign, 1831 MachineMemOperand::MOLoad); 1832 1833 for (unsigned j = 0; j < NumElts; ++j) { 1834 ProxyRegOps.push_back(RetVal.getValue(j)); 1835 1836 if (needTruncate) 1837 ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT)); 1838 else 1839 ProxyRegTruncates.push_back(Optional<MVT>()); 1840 } 1841 1842 Chain = RetVal.getValue(NumElts); 1843 InFlag = RetVal.getValue(NumElts + 1); 1844 1845 // Cleanup 1846 VecIdx = -1; 1847 LoadVTs.clear(); 1848 } 1849 } 1850 } 1851 1852 Chain = DAG.getCALLSEQ_END(Chain, 1853 DAG.getIntPtrConstant(uniqueCallSite, dl, true), 1854 DAG.getIntPtrConstant(uniqueCallSite + 1, dl, 1855 true), 1856 InFlag, dl); 1857 InFlag = Chain.getValue(1); 1858 uniqueCallSite++; 1859 1860 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 1861 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 1862 // dangling. 1863 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 1864 SDValue Ret = DAG.getNode( 1865 NVPTXISD::ProxyReg, dl, 1866 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 1867 { Chain, ProxyRegOps[i], InFlag } 1868 ); 1869 1870 Chain = Ret.getValue(1); 1871 InFlag = Ret.getValue(2); 1872 1873 if (ProxyRegTruncates[i].hasValue()) { 1874 Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret); 1875 } 1876 1877 InVals.push_back(Ret); 1878 } 1879 1880 // set isTailCall to false for now, until we figure out how to express 1881 // tail call optimization in PTX 1882 isTailCall = false; 1883 return Chain; 1884 } 1885 1886 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 1887 // (see LegalizeDAG.cpp). This is slow and uses local memory. 1888 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 1889 SDValue 1890 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 1891 SDNode *Node = Op.getNode(); 1892 SDLoc dl(Node); 1893 SmallVector<SDValue, 8> Ops; 1894 unsigned NumOperands = Node->getNumOperands(); 1895 for (unsigned i = 0; i < NumOperands; ++i) { 1896 SDValue SubOp = Node->getOperand(i); 1897 EVT VVT = SubOp.getNode()->getValueType(0); 1898 EVT EltVT = VVT.getVectorElementType(); 1899 unsigned NumSubElem = VVT.getVectorNumElements(); 1900 for (unsigned j = 0; j < NumSubElem; ++j) { 1901 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 1902 DAG.getIntPtrConstant(j, dl))); 1903 } 1904 } 1905 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 1906 } 1907 1908 // We can init constant f16x2 with a single .b32 move. 
Normally it 1909 // would get lowered as two constant loads and vector-packing move. 1910 // mov.b16 %h1, 0x4000; 1911 // mov.b16 %h2, 0x3C00; 1912 // mov.b32 %hh2, {%h2, %h1}; 1913 // Instead we want just a constant move: 1914 // mov.b32 %hh2, 0x40003C00 1915 // 1916 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 1917 // generates good SASS in both cases. 1918 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 1919 SelectionDAG &DAG) const { 1920 //return Op; 1921 if (!(Op->getValueType(0) == MVT::v2f16 && 1922 isa<ConstantFPSDNode>(Op->getOperand(0)) && 1923 isa<ConstantFPSDNode>(Op->getOperand(1)))) 1924 return Op; 1925 1926 APInt E0 = 1927 cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); 1928 APInt E1 = 1929 cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); 1930 SDValue Const = 1931 DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); 1932 return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const); 1933 } 1934 1935 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 1936 SelectionDAG &DAG) const { 1937 SDValue Index = Op->getOperand(1); 1938 // Constant index will be matched by tablegen. 1939 if (isa<ConstantSDNode>(Index.getNode())) 1940 return Op; 1941 1942 // Extract individual elements and select one of them. 1943 SDValue Vector = Op->getOperand(0); 1944 EVT VectorVT = Vector.getValueType(); 1945 assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); 1946 EVT EltVT = VectorVT.getVectorElementType(); 1947 1948 SDLoc dl(Op.getNode()); 1949 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 1950 DAG.getIntPtrConstant(0, dl)); 1951 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 1952 DAG.getIntPtrConstant(1, dl)); 1953 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 1954 ISD::CondCode::SETEQ); 1955 } 1956 1957 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 1958 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 1959 /// amount, or 1960 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 1961 /// amount. 1962 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 1963 SelectionDAG &DAG) const { 1964 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 1965 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 1966 1967 EVT VT = Op.getValueType(); 1968 unsigned VTBits = VT.getSizeInBits(); 1969 SDLoc dl(Op); 1970 SDValue ShOpLo = Op.getOperand(0); 1971 SDValue ShOpHi = Op.getOperand(1); 1972 SDValue ShAmt = Op.getOperand(2); 1973 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 1974 1975 if (VTBits == 32 && STI.getSmVersion() >= 35) { 1976 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
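    // (Illustrative PTX, operand names hypothetical:
    //    shf.r.clamp.b32 %lo, %aLo, %aHi, %amt;)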
1977 // {dHi, dLo} = {aHi, aLo} >> Amt 1978 // dHi = aHi >> Amt 1979 // dLo = shf.r.clamp aLo, aHi, Amt 1980 1981 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 1982 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 1983 ShAmt); 1984 1985 SDValue Ops[2] = { Lo, Hi }; 1986 return DAG.getMergeValues(Ops, dl); 1987 } 1988 else { 1989 // {dHi, dLo} = {aHi, aLo} >> Amt 1990 // - if (Amt>=size) then 1991 // dLo = aHi >> (Amt-size) 1992 // dHi = aHi >> Amt (this is either all 0 or all 1) 1993 // else 1994 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 1995 // dHi = aHi >> Amt 1996 1997 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 1998 DAG.getConstant(VTBits, dl, MVT::i32), 1999 ShAmt); 2000 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 2001 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2002 DAG.getConstant(VTBits, dl, MVT::i32)); 2003 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 2004 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2005 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2006 2007 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2008 DAG.getConstant(VTBits, dl, MVT::i32), 2009 ISD::SETGE); 2010 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2011 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2012 2013 SDValue Ops[2] = { Lo, Hi }; 2014 return DAG.getMergeValues(Ops, dl); 2015 } 2016 } 2017 2018 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2019 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2020 /// amount, or 2021 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2022 /// amount. 2023 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2024 SelectionDAG &DAG) const { 2025 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2026 assert(Op.getOpcode() == ISD::SHL_PARTS); 2027 2028 EVT VT = Op.getValueType(); 2029 unsigned VTBits = VT.getSizeInBits(); 2030 SDLoc dl(Op); 2031 SDValue ShOpLo = Op.getOperand(0); 2032 SDValue ShOpHi = Op.getOperand(1); 2033 SDValue ShAmt = Op.getOperand(2); 2034 2035 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2036 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
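    // (Illustrative PTX, operand names hypothetical:
    //    shf.l.clamp.b32 %hi, %aLo, %aHi, %amt;)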
2037     // {dHi, dLo} = {aHi, aLo} << Amt
2038     // dHi = shf.l.clamp aLo, aHi, Amt
2039     // dLo = aLo << Amt
2040 
2041     SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2042                              ShAmt);
2043     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2044 
2045     SDValue Ops[2] = { Lo, Hi };
2046     return DAG.getMergeValues(Ops, dl);
2047   }
2048   else {
2049     // {dHi, dLo} = {aHi, aLo} << Amt
2050     // - if (Amt>=size) then
2051     //      dLo = aLo << Amt (all 0)
2052     //      dHi = aLo << (Amt-size)
2053     //   else
2054     //      dLo = aLo << Amt
2055     //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
2056 
2057     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2058                                    DAG.getConstant(VTBits, dl, MVT::i32),
2059                                    ShAmt);
2060     SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2061     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2062                                      DAG.getConstant(VTBits, dl, MVT::i32));
2063     SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2064     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2065     SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2066 
2067     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2068                                DAG.getConstant(VTBits, dl, MVT::i32),
2069                                ISD::SETGE);
2070     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2071     SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2072 
2073     SDValue Ops[2] = { Lo, Hi };
2074     return DAG.getMergeValues(Ops, dl);
2075   }
2076 }
2077 
2078 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2079   EVT VT = Op.getValueType();
2080 
2081   if (VT == MVT::f32)
2082     return LowerFROUND32(Op, DAG);
2083 
2084   if (VT == MVT::f64)
2085     return LowerFROUND64(Op, DAG);
2086 
2087   llvm_unreachable("unhandled type");
2088 }
2089 
2090 // This is the rounding method used in CUDA libdevice in C-like code:
2091 // float roundf(float A)
2092 // {
2093 //   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2094 //   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2095 //   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2096 // }
2097 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2098                                            SelectionDAG &DAG) const {
2099   SDLoc SL(Op);
2100   SDValue A = Op.getOperand(0);
2101   EVT VT = Op.getValueType();
2102 
2103   SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2104 
2105   // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2106   SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2107   const int SignBitMask = 0x80000000;
2108   SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2109                              DAG.getConstant(SignBitMask, SL, MVT::i32));
2110   const int PointFiveInBits = 0x3F000000;
2111   SDValue PointFiveWithSignRaw =
2112       DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2113                   DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2114   SDValue PointFiveWithSign =
2115       DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2116   SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2117   SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2118 
2119   // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2120   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2121   SDValue IsLarge =
2122       DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2123                    ISD::SETOGT);
2124   RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2125 
2126   // return abs(A) < 0.5 ?
(float)(int)A : RoundedA; 2127 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2128 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2129 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2130 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2131 } 2132 2133 // The implementation of round(double) is similar to that of round(float) in 2134 // that they both separate the value range into three regions and use a method 2135 // specific to the region to round the values. However, round(double) first 2136 // calculates the round of the absolute value and then adds the sign back while 2137 // round(float) directly rounds the value with sign. 2138 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2139 SelectionDAG &DAG) const { 2140 SDLoc SL(Op); 2141 SDValue A = Op.getOperand(0); 2142 EVT VT = Op.getValueType(); 2143 2144 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2145 2146 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2147 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2148 DAG.getConstantFP(0.5, SL, VT)); 2149 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2150 2151 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2152 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2153 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2154 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2155 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2156 DAG.getConstantFP(0, SL, VT), 2157 RoundedA); 2158 2159 // Add sign to rounded_A 2160 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2161 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2162 2163 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2164 SDValue IsLarge = 2165 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2166 ISD::SETOGT); 2167 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2168 } 2169 2170 2171 2172 SDValue 2173 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2174 switch (Op.getOpcode()) { 2175 case ISD::RETURNADDR: 2176 return SDValue(); 2177 case ISD::FRAMEADDR: 2178 return SDValue(); 2179 case ISD::GlobalAddress: 2180 return LowerGlobalAddress(Op, DAG); 2181 case ISD::INTRINSIC_W_CHAIN: 2182 return Op; 2183 case ISD::BUILD_VECTOR: 2184 return LowerBUILD_VECTOR(Op, DAG); 2185 case ISD::EXTRACT_SUBVECTOR: 2186 return Op; 2187 case ISD::EXTRACT_VECTOR_ELT: 2188 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2189 case ISD::CONCAT_VECTORS: 2190 return LowerCONCAT_VECTORS(Op, DAG); 2191 case ISD::STORE: 2192 return LowerSTORE(Op, DAG); 2193 case ISD::LOAD: 2194 return LowerLOAD(Op, DAG); 2195 case ISD::SHL_PARTS: 2196 return LowerShiftLeftParts(Op, DAG); 2197 case ISD::SRA_PARTS: 2198 case ISD::SRL_PARTS: 2199 return LowerShiftRightParts(Op, DAG); 2200 case ISD::SELECT: 2201 return LowerSelect(Op, DAG); 2202 case ISD::FROUND: 2203 return LowerFROUND(Op, DAG); 2204 default: 2205 llvm_unreachable("Custom lowering not defined for operation"); 2206 } 2207 } 2208 2209 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2210 SDValue Op0 = Op->getOperand(0); 2211 SDValue Op1 = Op->getOperand(1); 2212 SDValue Op2 = Op->getOperand(2); 2213 SDLoc DL(Op.getNode()); 2214 2215 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2216 2217 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2218 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2219 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 
2220 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2221 2222 return Trunc; 2223 } 2224 2225 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2226 if (Op.getValueType() == MVT::i1) 2227 return LowerLOADi1(Op, DAG); 2228 2229 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2230 // loads and have to handle it here. 2231 if (Op.getValueType() == MVT::v2f16) { 2232 LoadSDNode *Load = cast<LoadSDNode>(Op); 2233 EVT MemVT = Load->getMemoryVT(); 2234 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2235 MemVT, *Load->getMemOperand())) { 2236 SDValue Ops[2]; 2237 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2238 return DAG.getMergeValues(Ops, SDLoc(Op)); 2239 } 2240 } 2241 2242 return SDValue(); 2243 } 2244 2245 // v = ld i1* addr 2246 // => 2247 // v1 = ld i8* addr (-> i16) 2248 // v = trunc i16 to i1 2249 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2250 SDNode *Node = Op.getNode(); 2251 LoadSDNode *LD = cast<LoadSDNode>(Node); 2252 SDLoc dl(Node); 2253 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2254 assert(Node->getValueType(0) == MVT::i1 && 2255 "Custom lowering for i1 load only"); 2256 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), 2257 LD->getPointerInfo(), LD->getAlignment(), 2258 LD->getMemOperand()->getFlags()); 2259 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2260 // The legalizer (the caller) is expecting two values from the legalized 2261 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2262 // in LegalizeDAG.cpp which also uses MergeValues. 2263 SDValue Ops[] = { result, LD->getChain() }; 2264 return DAG.getMergeValues(Ops, dl); 2265 } 2266 2267 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2268 StoreSDNode *Store = cast<StoreSDNode>(Op); 2269 EVT VT = Store->getMemoryVT(); 2270 2271 if (VT == MVT::i1) 2272 return LowerSTOREi1(Op, DAG); 2273 2274 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2275 // stores and have to handle it here. 2276 if (VT == MVT::v2f16 && 2277 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2278 VT, *Store->getMemOperand())) 2279 return expandUnalignedStore(Store, DAG); 2280 2281 if (VT.isVector()) 2282 return LowerSTOREVector(Op, DAG); 2283 2284 return SDValue(); 2285 } 2286 2287 SDValue 2288 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2289 SDNode *N = Op.getNode(); 2290 SDValue Val = N->getOperand(1); 2291 SDLoc DL(N); 2292 EVT ValVT = Val.getValueType(); 2293 2294 if (ValVT.isVector()) { 2295 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2296 // legal. We can (and should) split that into 2 stores of <2 x double> here 2297 // but I'm leaving that as a TODO for now. 
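    // (Descriptive note: the switch below simply whitelists the vector types
    // that map onto st.v2/st.v4; any other vector type returns SDValue() and
    // falls back to the default legalization path.)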
2298 if (!ValVT.isSimple()) 2299 return SDValue(); 2300 switch (ValVT.getSimpleVT().SimpleTy) { 2301 default: 2302 return SDValue(); 2303 case MVT::v2i8: 2304 case MVT::v2i16: 2305 case MVT::v2i32: 2306 case MVT::v2i64: 2307 case MVT::v2f16: 2308 case MVT::v2f32: 2309 case MVT::v2f64: 2310 case MVT::v4i8: 2311 case MVT::v4i16: 2312 case MVT::v4i32: 2313 case MVT::v4f16: 2314 case MVT::v4f32: 2315 case MVT::v8f16: // <4 x f16x2> 2316 // This is a "native" vector type 2317 break; 2318 } 2319 2320 MemSDNode *MemSD = cast<MemSDNode>(N); 2321 const DataLayout &TD = DAG.getDataLayout(); 2322 2323 unsigned Align = MemSD->getAlignment(); 2324 unsigned PrefAlign = 2325 TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); 2326 if (Align < PrefAlign) { 2327 // This store is not sufficiently aligned, so bail out and let this vector 2328 // store be scalarized. Note that we may still be able to emit smaller 2329 // vector stores. For example, if we are storing a <4 x float> with an 2330 // alignment of 8, this check will fail but the legalizer will try again 2331 // with 2 x <2 x float>, which will succeed with an alignment of 8. 2332 return SDValue(); 2333 } 2334 2335 unsigned Opcode = 0; 2336 EVT EltVT = ValVT.getVectorElementType(); 2337 unsigned NumElts = ValVT.getVectorNumElements(); 2338 2339 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2340 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 2341 // stored type to i16 and propagate the "real" type as the memory type. 2342 bool NeedExt = false; 2343 if (EltVT.getSizeInBits() < 16) 2344 NeedExt = true; 2345 2346 bool StoreF16x2 = false; 2347 switch (NumElts) { 2348 default: 2349 return SDValue(); 2350 case 2: 2351 Opcode = NVPTXISD::StoreV2; 2352 break; 2353 case 4: 2354 Opcode = NVPTXISD::StoreV4; 2355 break; 2356 case 8: 2357 // v8f16 is a special case. PTX doesn't have st.v8.f16 2358 // instruction. Instead, we split the vector into v2f16 chunks and 2359 // store them with st.v4.b32. 
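      // Illustrative PTX for this case (register names and state space are
      // hypothetical):
      //   st.global.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
      // where each 32-bit register holds one packed f16x2 pair.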
2360 assert(EltVT == MVT::f16 && "Wrong type for the vector."); 2361 Opcode = NVPTXISD::StoreV4; 2362 StoreF16x2 = true; 2363 break; 2364 } 2365 2366 SmallVector<SDValue, 8> Ops; 2367 2368 // First is the chain 2369 Ops.push_back(N->getOperand(0)); 2370 2371 if (StoreF16x2) { 2372 // Combine f16,f16 -> v2f16 2373 NumElts /= 2; 2374 for (unsigned i = 0; i < NumElts; ++i) { 2375 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2376 DAG.getIntPtrConstant(i * 2, DL)); 2377 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2378 DAG.getIntPtrConstant(i * 2 + 1, DL)); 2379 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); 2380 Ops.push_back(V2); 2381 } 2382 } else { 2383 // Then the split values 2384 for (unsigned i = 0; i < NumElts; ++i) { 2385 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2386 DAG.getIntPtrConstant(i, DL)); 2387 if (NeedExt) 2388 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 2389 Ops.push_back(ExtVal); 2390 } 2391 } 2392 2393 // Then any remaining arguments 2394 Ops.append(N->op_begin() + 2, N->op_end()); 2395 2396 SDValue NewSt = 2397 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 2398 MemSD->getMemoryVT(), MemSD->getMemOperand()); 2399 2400 // return DCI.CombineTo(N, NewSt, true); 2401 return NewSt; 2402 } 2403 2404 return SDValue(); 2405 } 2406 2407 // st i1 v, addr 2408 // => 2409 // v1 = zxt v to i16 2410 // st.u8 i16, addr 2411 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 2412 SDNode *Node = Op.getNode(); 2413 SDLoc dl(Node); 2414 StoreSDNode *ST = cast<StoreSDNode>(Node); 2415 SDValue Tmp1 = ST->getChain(); 2416 SDValue Tmp2 = ST->getBasePtr(); 2417 SDValue Tmp3 = ST->getValue(); 2418 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 2419 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 2420 SDValue Result = 2421 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 2422 ST->getAlignment(), ST->getMemOperand()->getFlags()); 2423 return Result; 2424 } 2425 2426 SDValue 2427 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { 2428 std::string ParamSym; 2429 raw_string_ostream ParamStr(ParamSym); 2430 2431 ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx; 2432 ParamStr.flush(); 2433 2434 std::string *SavedStr = 2435 nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str()); 2436 return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); 2437 } 2438 2439 // Check to see if the kernel argument is image*_t or sampler_t 2440 2441 static bool isImageOrSamplerVal(const Value *arg, const Module *context) { 2442 static const char *const specialTypes[] = { "struct._image2d_t", 2443 "struct._image3d_t", 2444 "struct._sampler_t" }; 2445 2446 Type *Ty = arg->getType(); 2447 auto *PTy = dyn_cast<PointerType>(Ty); 2448 2449 if (!PTy) 2450 return false; 2451 2452 if (!context) 2453 return false; 2454 2455 auto *STy = dyn_cast<StructType>(PTy->getElementType()); 2456 if (!STy || STy->isLiteral()) 2457 return false; 2458 2459 return std::find(std::begin(specialTypes), std::end(specialTypes), 2460 STy->getName()) != std::end(specialTypes); 2461 } 2462 2463 SDValue NVPTXTargetLowering::LowerFormalArguments( 2464 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2465 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2466 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2467 MachineFunction &MF = 
DAG.getMachineFunction();
2468   const DataLayout &DL = DAG.getDataLayout();
2469   auto PtrVT = getPointerTy(DAG.getDataLayout());
2470 
2471   const Function *F = &MF.getFunction();
2472   const AttributeList &PAL = F->getAttributes();
2473   const TargetLowering *TLI = STI.getTargetLowering();
2474 
2475   SDValue Root = DAG.getRoot();
2476   std::vector<SDValue> OutChains;
2477 
2478   bool isABI = (STI.getSmVersion() >= 20);
2479   assert(isABI && "Non-ABI compilation is not supported");
2480   if (!isABI)
2481     return Chain;
2482 
2483   std::vector<Type *> argTypes;
2484   std::vector<const Argument *> theArgs;
2485   for (const Argument &I : F->args()) {
2486     theArgs.push_back(&I);
2487     argTypes.push_back(I.getType());
2488   }
2489   // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2490   // Ins.size() will be larger
2491   //   * if there is an aggregate argument with multiple fields (each field
2492   //     showing up separately in Ins)
2493   //   * if there is a vector argument with more than typical vector-length
2494   //     elements (generally if more than 4) where each vector element is
2495   //     individually present in Ins.
2496   // So a different index should be used for indexing into Ins.
2497   // See similar issue in LowerCall.
2498   unsigned InsIdx = 0;
2499 
2500   int idx = 0;
2501   for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2502     Type *Ty = argTypes[i];
2503 
2504     // If the kernel argument is image*_t or sampler_t, convert it to
2505     // an i32 constant holding the parameter position. This can later be
2506     // matched in the AsmPrinter to output the correct mangled name.
2507     if (isImageOrSamplerVal(
2508             theArgs[i],
2509             (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2510                                      : nullptr))) {
2511       assert(isKernelFunction(*F) &&
2512              "Only kernels can have image/sampler params");
2513       InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2514       continue;
2515     }
2516 
2517     if (theArgs[i]->use_empty()) {
2518       // argument is dead
2519       if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2520         SmallVector<EVT, 16> vtparts;
2521 
2522         ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2523         assert(vtparts.size() > 0 && "empty aggregate type not expected");
2524         for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2525              ++parti) {
2526           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2527           ++InsIdx;
2528         }
2529         if (vtparts.size() > 0)
2530           --InsIdx;
2531         continue;
2532       }
2533       if (Ty->isVectorTy()) {
2534         EVT ObjectVT = getValueType(DL, Ty);
2535         unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2536         for (unsigned parti = 0; parti < NumRegs; ++parti) {
2537           InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2538           ++InsIdx;
2539         }
2540         if (NumRegs > 0)
2541           --InsIdx;
2542         continue;
2543       }
2544       InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2545       continue;
2546     }
2547 
2548     // In the following cases, assign a node order of "idx+1"
2549     // to newly created nodes. The SDNodes for params have to
2550     // appear in the same order as their order of appearance
2551     // in the original function. "idx+1" holds that order.
2552 if (!PAL.hasParamAttribute(i, Attribute::ByVal)) { 2553 bool aggregateIsPacked = false; 2554 if (StructType *STy = dyn_cast<StructType>(Ty)) 2555 aggregateIsPacked = STy->isPacked(); 2556 2557 SmallVector<EVT, 16> VTs; 2558 SmallVector<uint64_t, 16> Offsets; 2559 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 2560 assert(VTs.size() > 0 && "Unexpected empty type."); 2561 auto VectorInfo = 2562 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty)); 2563 2564 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2565 int VecIdx = -1; // Index of the first element of the current vector. 2566 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 2567 if (VectorInfo[parti] & PVF_FIRST) { 2568 assert(VecIdx == -1 && "Orphaned vector."); 2569 VecIdx = parti; 2570 } 2571 2572 // That's the last element of this store op. 2573 if (VectorInfo[parti] & PVF_LAST) { 2574 unsigned NumElts = parti - VecIdx + 1; 2575 EVT EltVT = VTs[parti]; 2576 // i1 is loaded/stored as i8. 2577 EVT LoadVT = EltVT; 2578 if (EltVT == MVT::i1) 2579 LoadVT = MVT::i8; 2580 else if (EltVT == MVT::v2f16) 2581 // getLoad needs a vector type, but it can't handle 2582 // vectors which contain v2f16 elements. So we must load 2583 // using i32 here and then bitcast back. 2584 LoadVT = MVT::i32; 2585 2586 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 2587 SDValue VecAddr = 2588 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 2589 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 2590 Value *srcValue = Constant::getNullValue(PointerType::get( 2591 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 2592 SDValue P = 2593 DAG.getLoad(VecVT, dl, Root, VecAddr, 2594 MachinePointerInfo(srcValue), aggregateIsPacked, 2595 MachineMemOperand::MODereferenceable | 2596 MachineMemOperand::MOInvariant); 2597 if (P.getNode()) 2598 P.getNode()->setIROrder(idx + 1); 2599 for (unsigned j = 0; j < NumElts; ++j) { 2600 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 2601 DAG.getIntPtrConstant(j, dl)); 2602 // We've loaded i1 as an i8 and now must truncate it back to i1 2603 if (EltVT == MVT::i1) 2604 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 2605 // v2f16 was loaded as an i32. Now we must bitcast it back. 2606 else if (EltVT == MVT::v2f16) 2607 Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); 2608 // Extend the element if necessary (e.g. an i8 is loaded 2609 // into an i16 register) 2610 if (Ins[InsIdx].VT.isInteger() && 2611 Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { 2612 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 2613 : ISD::ZERO_EXTEND; 2614 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 2615 } 2616 InVals.push_back(Elt); 2617 } 2618 2619 // Reset vector tracking state. 2620 VecIdx = -1; 2621 } 2622 ++InsIdx; 2623 } 2624 if (VTs.size() > 0) 2625 --InsIdx; 2626 continue; 2627 } 2628 2629 // Param has ByVal attribute 2630 // Return MoveParam(param symbol). 2631 // Ideally, the param symbol can be returned directly, 2632 // but when SDNode builder decides to use it in a CopyToReg(), 2633 // machine instruction fails because TargetExternalSymbol 2634 // (not lowered) is target dependent, and CopyToReg assumes 2635 // the source is lowered. 
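    // (Descriptive note: no data is loaded from the parameter here; MoveParam
    // merely wraps the param symbol so later nodes such as CopyToReg can
    // consume it, and loads through the byval pointer are emitted at its uses.)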
2636 EVT ObjectVT = getValueType(DL, Ty); 2637 assert(ObjectVT == Ins[InsIdx].VT && 2638 "Ins type did not match function type"); 2639 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2640 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 2641 if (p.getNode()) 2642 p.getNode()->setIROrder(idx + 1); 2643 InVals.push_back(p); 2644 } 2645 2646 // Clang will check explicit VarArg and issue error if any. However, Clang 2647 // will let code with 2648 // implicit var arg like f() pass. See bug 617733. 2649 // We treat this case as if the arg list is empty. 2650 // if (F.isVarArg()) { 2651 // assert(0 && "VarArg not supported yet!"); 2652 //} 2653 2654 if (!OutChains.empty()) 2655 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 2656 2657 return Chain; 2658 } 2659 2660 SDValue 2661 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2662 bool isVarArg, 2663 const SmallVectorImpl<ISD::OutputArg> &Outs, 2664 const SmallVectorImpl<SDValue> &OutVals, 2665 const SDLoc &dl, SelectionDAG &DAG) const { 2666 MachineFunction &MF = DAG.getMachineFunction(); 2667 Type *RetTy = MF.getFunction().getReturnType(); 2668 2669 bool isABI = (STI.getSmVersion() >= 20); 2670 assert(isABI && "Non-ABI compilation is not supported"); 2671 if (!isABI) 2672 return Chain; 2673 2674 const DataLayout DL = DAG.getDataLayout(); 2675 SmallVector<EVT, 16> VTs; 2676 SmallVector<uint64_t, 16> Offsets; 2677 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 2678 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 2679 2680 auto VectorInfo = VectorizePTXValueVTs( 2681 VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1); 2682 2683 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2684 // 32-bits are sign extended or zero extended, depending on whether 2685 // they are signed or unsigned types. 2686 bool ExtendIntegerRetVal = 2687 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2688 2689 SmallVector<SDValue, 6> StoreOperands; 2690 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2691 // New load/store. Record chain and offset operands. 2692 if (VectorInfo[i] & PVF_FIRST) { 2693 assert(StoreOperands.empty() && "Orphaned operand list."); 2694 StoreOperands.push_back(Chain); 2695 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 2696 } 2697 2698 SDValue RetVal = OutVals[i]; 2699 if (ExtendIntegerRetVal) { 2700 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 2701 : ISD::ZERO_EXTEND, 2702 dl, MVT::i32, RetVal); 2703 } else if (RetVal.getValueSizeInBits() < 16) { 2704 // Use 16-bit registers for small load-stores as it's the 2705 // smallest general purpose register size supported by NVPTX. 2706 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 2707 } 2708 2709 // Record the value to return. 2710 StoreOperands.push_back(RetVal); 2711 2712 // That's the last element of this store op. 2713 if (VectorInfo[i] & PVF_LAST) { 2714 NVPTXISD::NodeType Op; 2715 unsigned NumElts = StoreOperands.size() - 2; 2716 switch (NumElts) { 2717 case 1: 2718 Op = NVPTXISD::StoreRetval; 2719 break; 2720 case 2: 2721 Op = NVPTXISD::StoreRetvalV2; 2722 break; 2723 case 4: 2724 Op = NVPTXISD::StoreRetvalV4; 2725 break; 2726 default: 2727 llvm_unreachable("Invalid vector info."); 2728 } 2729 2730 // Adjust type of load/store op if we've extended the scalar 2731 // return value. 2732 EVT TheStoreType = ExtendIntegerRetVal ? 
MVT::i32 : VTs[i]; 2733 Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), 2734 StoreOperands, TheStoreType, 2735 MachinePointerInfo(), /* Align */ 1, 2736 MachineMemOperand::MOStore); 2737 // Cleanup vector state. 2738 StoreOperands.clear(); 2739 } 2740 } 2741 2742 return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); 2743 } 2744 2745 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 2746 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 2747 SelectionDAG &DAG) const { 2748 if (Constraint.length() > 1) 2749 return; 2750 else 2751 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 2752 } 2753 2754 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 2755 switch (Intrinsic) { 2756 default: 2757 return 0; 2758 2759 case Intrinsic::nvvm_tex_1d_v4f32_s32: 2760 return NVPTXISD::Tex1DFloatS32; 2761 case Intrinsic::nvvm_tex_1d_v4f32_f32: 2762 return NVPTXISD::Tex1DFloatFloat; 2763 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 2764 return NVPTXISD::Tex1DFloatFloatLevel; 2765 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 2766 return NVPTXISD::Tex1DFloatFloatGrad; 2767 case Intrinsic::nvvm_tex_1d_v4s32_s32: 2768 return NVPTXISD::Tex1DS32S32; 2769 case Intrinsic::nvvm_tex_1d_v4s32_f32: 2770 return NVPTXISD::Tex1DS32Float; 2771 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 2772 return NVPTXISD::Tex1DS32FloatLevel; 2773 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 2774 return NVPTXISD::Tex1DS32FloatGrad; 2775 case Intrinsic::nvvm_tex_1d_v4u32_s32: 2776 return NVPTXISD::Tex1DU32S32; 2777 case Intrinsic::nvvm_tex_1d_v4u32_f32: 2778 return NVPTXISD::Tex1DU32Float; 2779 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 2780 return NVPTXISD::Tex1DU32FloatLevel; 2781 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 2782 return NVPTXISD::Tex1DU32FloatGrad; 2783 2784 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 2785 return NVPTXISD::Tex1DArrayFloatS32; 2786 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 2787 return NVPTXISD::Tex1DArrayFloatFloat; 2788 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 2789 return NVPTXISD::Tex1DArrayFloatFloatLevel; 2790 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 2791 return NVPTXISD::Tex1DArrayFloatFloatGrad; 2792 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 2793 return NVPTXISD::Tex1DArrayS32S32; 2794 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 2795 return NVPTXISD::Tex1DArrayS32Float; 2796 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 2797 return NVPTXISD::Tex1DArrayS32FloatLevel; 2798 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 2799 return NVPTXISD::Tex1DArrayS32FloatGrad; 2800 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 2801 return NVPTXISD::Tex1DArrayU32S32; 2802 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 2803 return NVPTXISD::Tex1DArrayU32Float; 2804 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 2805 return NVPTXISD::Tex1DArrayU32FloatLevel; 2806 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 2807 return NVPTXISD::Tex1DArrayU32FloatGrad; 2808 2809 case Intrinsic::nvvm_tex_2d_v4f32_s32: 2810 return NVPTXISD::Tex2DFloatS32; 2811 case Intrinsic::nvvm_tex_2d_v4f32_f32: 2812 return NVPTXISD::Tex2DFloatFloat; 2813 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 2814 return NVPTXISD::Tex2DFloatFloatLevel; 2815 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 2816 return NVPTXISD::Tex2DFloatFloatGrad; 2817 case Intrinsic::nvvm_tex_2d_v4s32_s32: 2818 return NVPTXISD::Tex2DS32S32; 2819 case Intrinsic::nvvm_tex_2d_v4s32_f32: 2820 return NVPTXISD::Tex2DS32Float; 2821 case 
Intrinsic::nvvm_tex_2d_level_v4s32_f32: 2822 return NVPTXISD::Tex2DS32FloatLevel; 2823 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 2824 return NVPTXISD::Tex2DS32FloatGrad; 2825 case Intrinsic::nvvm_tex_2d_v4u32_s32: 2826 return NVPTXISD::Tex2DU32S32; 2827 case Intrinsic::nvvm_tex_2d_v4u32_f32: 2828 return NVPTXISD::Tex2DU32Float; 2829 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 2830 return NVPTXISD::Tex2DU32FloatLevel; 2831 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 2832 return NVPTXISD::Tex2DU32FloatGrad; 2833 2834 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 2835 return NVPTXISD::Tex2DArrayFloatS32; 2836 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 2837 return NVPTXISD::Tex2DArrayFloatFloat; 2838 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 2839 return NVPTXISD::Tex2DArrayFloatFloatLevel; 2840 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 2841 return NVPTXISD::Tex2DArrayFloatFloatGrad; 2842 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 2843 return NVPTXISD::Tex2DArrayS32S32; 2844 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 2845 return NVPTXISD::Tex2DArrayS32Float; 2846 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 2847 return NVPTXISD::Tex2DArrayS32FloatLevel; 2848 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 2849 return NVPTXISD::Tex2DArrayS32FloatGrad; 2850 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 2851 return NVPTXISD::Tex2DArrayU32S32; 2852 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 2853 return NVPTXISD::Tex2DArrayU32Float; 2854 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 2855 return NVPTXISD::Tex2DArrayU32FloatLevel; 2856 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 2857 return NVPTXISD::Tex2DArrayU32FloatGrad; 2858 2859 case Intrinsic::nvvm_tex_3d_v4f32_s32: 2860 return NVPTXISD::Tex3DFloatS32; 2861 case Intrinsic::nvvm_tex_3d_v4f32_f32: 2862 return NVPTXISD::Tex3DFloatFloat; 2863 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 2864 return NVPTXISD::Tex3DFloatFloatLevel; 2865 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 2866 return NVPTXISD::Tex3DFloatFloatGrad; 2867 case Intrinsic::nvvm_tex_3d_v4s32_s32: 2868 return NVPTXISD::Tex3DS32S32; 2869 case Intrinsic::nvvm_tex_3d_v4s32_f32: 2870 return NVPTXISD::Tex3DS32Float; 2871 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 2872 return NVPTXISD::Tex3DS32FloatLevel; 2873 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 2874 return NVPTXISD::Tex3DS32FloatGrad; 2875 case Intrinsic::nvvm_tex_3d_v4u32_s32: 2876 return NVPTXISD::Tex3DU32S32; 2877 case Intrinsic::nvvm_tex_3d_v4u32_f32: 2878 return NVPTXISD::Tex3DU32Float; 2879 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 2880 return NVPTXISD::Tex3DU32FloatLevel; 2881 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 2882 return NVPTXISD::Tex3DU32FloatGrad; 2883 2884 case Intrinsic::nvvm_tex_cube_v4f32_f32: 2885 return NVPTXISD::TexCubeFloatFloat; 2886 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 2887 return NVPTXISD::TexCubeFloatFloatLevel; 2888 case Intrinsic::nvvm_tex_cube_v4s32_f32: 2889 return NVPTXISD::TexCubeS32Float; 2890 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 2891 return NVPTXISD::TexCubeS32FloatLevel; 2892 case Intrinsic::nvvm_tex_cube_v4u32_f32: 2893 return NVPTXISD::TexCubeU32Float; 2894 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 2895 return NVPTXISD::TexCubeU32FloatLevel; 2896 2897 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 2898 return NVPTXISD::TexCubeArrayFloatFloat; 2899 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 2900 return NVPTXISD::TexCubeArrayFloatFloatLevel; 2901 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 2902 return 
NVPTXISD::TexCubeArrayS32Float; 2903 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 2904 return NVPTXISD::TexCubeArrayS32FloatLevel; 2905 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 2906 return NVPTXISD::TexCubeArrayU32Float; 2907 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 2908 return NVPTXISD::TexCubeArrayU32FloatLevel; 2909 2910 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 2911 return NVPTXISD::Tld4R2DFloatFloat; 2912 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 2913 return NVPTXISD::Tld4G2DFloatFloat; 2914 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 2915 return NVPTXISD::Tld4B2DFloatFloat; 2916 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 2917 return NVPTXISD::Tld4A2DFloatFloat; 2918 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 2919 return NVPTXISD::Tld4R2DS64Float; 2920 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 2921 return NVPTXISD::Tld4G2DS64Float; 2922 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 2923 return NVPTXISD::Tld4B2DS64Float; 2924 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 2925 return NVPTXISD::Tld4A2DS64Float; 2926 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 2927 return NVPTXISD::Tld4R2DU64Float; 2928 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 2929 return NVPTXISD::Tld4G2DU64Float; 2930 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 2931 return NVPTXISD::Tld4B2DU64Float; 2932 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 2933 return NVPTXISD::Tld4A2DU64Float; 2934 2935 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 2936 return NVPTXISD::TexUnified1DFloatS32; 2937 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 2938 return NVPTXISD::TexUnified1DFloatFloat; 2939 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 2940 return NVPTXISD::TexUnified1DFloatFloatLevel; 2941 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 2942 return NVPTXISD::TexUnified1DFloatFloatGrad; 2943 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 2944 return NVPTXISD::TexUnified1DS32S32; 2945 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 2946 return NVPTXISD::TexUnified1DS32Float; 2947 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 2948 return NVPTXISD::TexUnified1DS32FloatLevel; 2949 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 2950 return NVPTXISD::TexUnified1DS32FloatGrad; 2951 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 2952 return NVPTXISD::TexUnified1DU32S32; 2953 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 2954 return NVPTXISD::TexUnified1DU32Float; 2955 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 2956 return NVPTXISD::TexUnified1DU32FloatLevel; 2957 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 2958 return NVPTXISD::TexUnified1DU32FloatGrad; 2959 2960 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 2961 return NVPTXISD::TexUnified1DArrayFloatS32; 2962 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 2963 return NVPTXISD::TexUnified1DArrayFloatFloat; 2964 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 2965 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 2966 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 2967 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 2968 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 2969 return NVPTXISD::TexUnified1DArrayS32S32; 2970 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 2971 return NVPTXISD::TexUnified1DArrayS32Float; 2972 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 2973 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 2974 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 2975 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 2976 case 
Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 2977 return NVPTXISD::TexUnified1DArrayU32S32; 2978 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 2979 return NVPTXISD::TexUnified1DArrayU32Float; 2980 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 2981 return NVPTXISD::TexUnified1DArrayU32FloatLevel; 2982 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 2983 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 2984 2985 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 2986 return NVPTXISD::TexUnified2DFloatS32; 2987 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 2988 return NVPTXISD::TexUnified2DFloatFloat; 2989 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 2990 return NVPTXISD::TexUnified2DFloatFloatLevel; 2991 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 2992 return NVPTXISD::TexUnified2DFloatFloatGrad; 2993 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 2994 return NVPTXISD::TexUnified2DS32S32; 2995 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 2996 return NVPTXISD::TexUnified2DS32Float; 2997 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 2998 return NVPTXISD::TexUnified2DS32FloatLevel; 2999 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3000 return NVPTXISD::TexUnified2DS32FloatGrad; 3001 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3002 return NVPTXISD::TexUnified2DU32S32; 3003 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3004 return NVPTXISD::TexUnified2DU32Float; 3005 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3006 return NVPTXISD::TexUnified2DU32FloatLevel; 3007 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3008 return NVPTXISD::TexUnified2DU32FloatGrad; 3009 3010 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3011 return NVPTXISD::TexUnified2DArrayFloatS32; 3012 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3013 return NVPTXISD::TexUnified2DArrayFloatFloat; 3014 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3015 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3016 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3017 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3018 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3019 return NVPTXISD::TexUnified2DArrayS32S32; 3020 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3021 return NVPTXISD::TexUnified2DArrayS32Float; 3022 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3023 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3024 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3025 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3026 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3027 return NVPTXISD::TexUnified2DArrayU32S32; 3028 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3029 return NVPTXISD::TexUnified2DArrayU32Float; 3030 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3031 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3032 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3033 return NVPTXISD::TexUnified2DArrayU32FloatGrad; 3034 3035 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3036 return NVPTXISD::TexUnified3DFloatS32; 3037 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3038 return NVPTXISD::TexUnified3DFloatFloat; 3039 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3040 return NVPTXISD::TexUnified3DFloatFloatLevel; 3041 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3042 return NVPTXISD::TexUnified3DFloatFloatGrad; 3043 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3044 return NVPTXISD::TexUnified3DS32S32; 3045 case 
Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3046 return NVPTXISD::TexUnified3DS32Float; 3047 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3048 return NVPTXISD::TexUnified3DS32FloatLevel; 3049 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3050 return NVPTXISD::TexUnified3DS32FloatGrad; 3051 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3052 return NVPTXISD::TexUnified3DU32S32; 3053 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3054 return NVPTXISD::TexUnified3DU32Float; 3055 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3056 return NVPTXISD::TexUnified3DU32FloatLevel; 3057 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3058 return NVPTXISD::TexUnified3DU32FloatGrad; 3059 3060 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3061 return NVPTXISD::TexUnifiedCubeFloatFloat; 3062 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3063 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3064 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3065 return NVPTXISD::TexUnifiedCubeS32Float; 3066 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3067 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3068 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3069 return NVPTXISD::TexUnifiedCubeU32Float; 3070 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3071 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3072 3073 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3074 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3075 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3076 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3077 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3078 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3079 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3080 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3081 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3082 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3083 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3084 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3085 3086 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3087 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3088 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3089 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3090 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3091 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3092 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3093 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3094 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3095 return NVPTXISD::Tld4UnifiedR2DS64Float; 3096 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3097 return NVPTXISD::Tld4UnifiedG2DS64Float; 3098 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3099 return NVPTXISD::Tld4UnifiedB2DS64Float; 3100 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3101 return NVPTXISD::Tld4UnifiedA2DS64Float; 3102 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3103 return NVPTXISD::Tld4UnifiedR2DU64Float; 3104 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3105 return NVPTXISD::Tld4UnifiedG2DU64Float; 3106 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3107 return NVPTXISD::Tld4UnifiedB2DU64Float; 3108 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3109 return NVPTXISD::Tld4UnifiedA2DU64Float; 3110 } 3111 } 3112 3113 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3114 switch (Intrinsic) { 3115 default: 3116 return 0; 3117 case Intrinsic::nvvm_suld_1d_i8_clamp: 3118 return NVPTXISD::Suld1DI8Clamp; 3119 case Intrinsic::nvvm_suld_1d_i16_clamp: 3120 return 
NVPTXISD::Suld1DI16Clamp; 3121 case Intrinsic::nvvm_suld_1d_i32_clamp: 3122 return NVPTXISD::Suld1DI32Clamp; 3123 case Intrinsic::nvvm_suld_1d_i64_clamp: 3124 return NVPTXISD::Suld1DI64Clamp; 3125 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3126 return NVPTXISD::Suld1DV2I8Clamp; 3127 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3128 return NVPTXISD::Suld1DV2I16Clamp; 3129 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3130 return NVPTXISD::Suld1DV2I32Clamp; 3131 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3132 return NVPTXISD::Suld1DV2I64Clamp; 3133 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3134 return NVPTXISD::Suld1DV4I8Clamp; 3135 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3136 return NVPTXISD::Suld1DV4I16Clamp; 3137 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3138 return NVPTXISD::Suld1DV4I32Clamp; 3139 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3140 return NVPTXISD::Suld1DArrayI8Clamp; 3141 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3142 return NVPTXISD::Suld1DArrayI16Clamp; 3143 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3144 return NVPTXISD::Suld1DArrayI32Clamp; 3145 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3146 return NVPTXISD::Suld1DArrayI64Clamp; 3147 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3148 return NVPTXISD::Suld1DArrayV2I8Clamp; 3149 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3150 return NVPTXISD::Suld1DArrayV2I16Clamp; 3151 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3152 return NVPTXISD::Suld1DArrayV2I32Clamp; 3153 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3154 return NVPTXISD::Suld1DArrayV2I64Clamp; 3155 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3156 return NVPTXISD::Suld1DArrayV4I8Clamp; 3157 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3158 return NVPTXISD::Suld1DArrayV4I16Clamp; 3159 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3160 return NVPTXISD::Suld1DArrayV4I32Clamp; 3161 case Intrinsic::nvvm_suld_2d_i8_clamp: 3162 return NVPTXISD::Suld2DI8Clamp; 3163 case Intrinsic::nvvm_suld_2d_i16_clamp: 3164 return NVPTXISD::Suld2DI16Clamp; 3165 case Intrinsic::nvvm_suld_2d_i32_clamp: 3166 return NVPTXISD::Suld2DI32Clamp; 3167 case Intrinsic::nvvm_suld_2d_i64_clamp: 3168 return NVPTXISD::Suld2DI64Clamp; 3169 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3170 return NVPTXISD::Suld2DV2I8Clamp; 3171 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3172 return NVPTXISD::Suld2DV2I16Clamp; 3173 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3174 return NVPTXISD::Suld2DV2I32Clamp; 3175 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3176 return NVPTXISD::Suld2DV2I64Clamp; 3177 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3178 return NVPTXISD::Suld2DV4I8Clamp; 3179 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3180 return NVPTXISD::Suld2DV4I16Clamp; 3181 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3182 return NVPTXISD::Suld2DV4I32Clamp; 3183 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3184 return NVPTXISD::Suld2DArrayI8Clamp; 3185 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3186 return NVPTXISD::Suld2DArrayI16Clamp; 3187 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 3188 return NVPTXISD::Suld2DArrayI32Clamp; 3189 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3190 return NVPTXISD::Suld2DArrayI64Clamp; 3191 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3192 return NVPTXISD::Suld2DArrayV2I8Clamp; 3193 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3194 return NVPTXISD::Suld2DArrayV2I16Clamp; 3195 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3196 return NVPTXISD::Suld2DArrayV2I32Clamp; 3197 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3198 return NVPTXISD::Suld2DArrayV2I64Clamp; 
3199 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3200 return NVPTXISD::Suld2DArrayV4I8Clamp; 3201 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3202 return NVPTXISD::Suld2DArrayV4I16Clamp; 3203 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3204 return NVPTXISD::Suld2DArrayV4I32Clamp; 3205 case Intrinsic::nvvm_suld_3d_i8_clamp: 3206 return NVPTXISD::Suld3DI8Clamp; 3207 case Intrinsic::nvvm_suld_3d_i16_clamp: 3208 return NVPTXISD::Suld3DI16Clamp; 3209 case Intrinsic::nvvm_suld_3d_i32_clamp: 3210 return NVPTXISD::Suld3DI32Clamp; 3211 case Intrinsic::nvvm_suld_3d_i64_clamp: 3212 return NVPTXISD::Suld3DI64Clamp; 3213 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3214 return NVPTXISD::Suld3DV2I8Clamp; 3215 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3216 return NVPTXISD::Suld3DV2I16Clamp; 3217 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3218 return NVPTXISD::Suld3DV2I32Clamp; 3219 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3220 return NVPTXISD::Suld3DV2I64Clamp; 3221 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 3222 return NVPTXISD::Suld3DV4I8Clamp; 3223 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 3224 return NVPTXISD::Suld3DV4I16Clamp; 3225 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 3226 return NVPTXISD::Suld3DV4I32Clamp; 3227 case Intrinsic::nvvm_suld_1d_i8_trap: 3228 return NVPTXISD::Suld1DI8Trap; 3229 case Intrinsic::nvvm_suld_1d_i16_trap: 3230 return NVPTXISD::Suld1DI16Trap; 3231 case Intrinsic::nvvm_suld_1d_i32_trap: 3232 return NVPTXISD::Suld1DI32Trap; 3233 case Intrinsic::nvvm_suld_1d_i64_trap: 3234 return NVPTXISD::Suld1DI64Trap; 3235 case Intrinsic::nvvm_suld_1d_v2i8_trap: 3236 return NVPTXISD::Suld1DV2I8Trap; 3237 case Intrinsic::nvvm_suld_1d_v2i16_trap: 3238 return NVPTXISD::Suld1DV2I16Trap; 3239 case Intrinsic::nvvm_suld_1d_v2i32_trap: 3240 return NVPTXISD::Suld1DV2I32Trap; 3241 case Intrinsic::nvvm_suld_1d_v2i64_trap: 3242 return NVPTXISD::Suld1DV2I64Trap; 3243 case Intrinsic::nvvm_suld_1d_v4i8_trap: 3244 return NVPTXISD::Suld1DV4I8Trap; 3245 case Intrinsic::nvvm_suld_1d_v4i16_trap: 3246 return NVPTXISD::Suld1DV4I16Trap; 3247 case Intrinsic::nvvm_suld_1d_v4i32_trap: 3248 return NVPTXISD::Suld1DV4I32Trap; 3249 case Intrinsic::nvvm_suld_1d_array_i8_trap: 3250 return NVPTXISD::Suld1DArrayI8Trap; 3251 case Intrinsic::nvvm_suld_1d_array_i16_trap: 3252 return NVPTXISD::Suld1DArrayI16Trap; 3253 case Intrinsic::nvvm_suld_1d_array_i32_trap: 3254 return NVPTXISD::Suld1DArrayI32Trap; 3255 case Intrinsic::nvvm_suld_1d_array_i64_trap: 3256 return NVPTXISD::Suld1DArrayI64Trap; 3257 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 3258 return NVPTXISD::Suld1DArrayV2I8Trap; 3259 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 3260 return NVPTXISD::Suld1DArrayV2I16Trap; 3261 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 3262 return NVPTXISD::Suld1DArrayV2I32Trap; 3263 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 3264 return NVPTXISD::Suld1DArrayV2I64Trap; 3265 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 3266 return NVPTXISD::Suld1DArrayV4I8Trap; 3267 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 3268 return NVPTXISD::Suld1DArrayV4I16Trap; 3269 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 3270 return NVPTXISD::Suld1DArrayV4I32Trap; 3271 case Intrinsic::nvvm_suld_2d_i8_trap: 3272 return NVPTXISD::Suld2DI8Trap; 3273 case Intrinsic::nvvm_suld_2d_i16_trap: 3274 return NVPTXISD::Suld2DI16Trap; 3275 case Intrinsic::nvvm_suld_2d_i32_trap: 3276 return NVPTXISD::Suld2DI32Trap; 3277 case Intrinsic::nvvm_suld_2d_i64_trap: 3278 return NVPTXISD::Suld2DI64Trap; 3279 case Intrinsic::nvvm_suld_2d_v2i8_trap: 3280 return 
NVPTXISD::Suld2DV2I8Trap; 3281 case Intrinsic::nvvm_suld_2d_v2i16_trap: 3282 return NVPTXISD::Suld2DV2I16Trap; 3283 case Intrinsic::nvvm_suld_2d_v2i32_trap: 3284 return NVPTXISD::Suld2DV2I32Trap; 3285 case Intrinsic::nvvm_suld_2d_v2i64_trap: 3286 return NVPTXISD::Suld2DV2I64Trap; 3287 case Intrinsic::nvvm_suld_2d_v4i8_trap: 3288 return NVPTXISD::Suld2DV4I8Trap; 3289 case Intrinsic::nvvm_suld_2d_v4i16_trap: 3290 return NVPTXISD::Suld2DV4I16Trap; 3291 case Intrinsic::nvvm_suld_2d_v4i32_trap: 3292 return NVPTXISD::Suld2DV4I32Trap; 3293 case Intrinsic::nvvm_suld_2d_array_i8_trap: 3294 return NVPTXISD::Suld2DArrayI8Trap; 3295 case Intrinsic::nvvm_suld_2d_array_i16_trap: 3296 return NVPTXISD::Suld2DArrayI16Trap; 3297 case Intrinsic::nvvm_suld_2d_array_i32_trap: 3298 return NVPTXISD::Suld2DArrayI32Trap; 3299 case Intrinsic::nvvm_suld_2d_array_i64_trap: 3300 return NVPTXISD::Suld2DArrayI64Trap; 3301 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 3302 return NVPTXISD::Suld2DArrayV2I8Trap; 3303 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 3304 return NVPTXISD::Suld2DArrayV2I16Trap; 3305 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 3306 return NVPTXISD::Suld2DArrayV2I32Trap; 3307 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 3308 return NVPTXISD::Suld2DArrayV2I64Trap; 3309 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 3310 return NVPTXISD::Suld2DArrayV4I8Trap; 3311 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 3312 return NVPTXISD::Suld2DArrayV4I16Trap; 3313 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 3314 return NVPTXISD::Suld2DArrayV4I32Trap; 3315 case Intrinsic::nvvm_suld_3d_i8_trap: 3316 return NVPTXISD::Suld3DI8Trap; 3317 case Intrinsic::nvvm_suld_3d_i16_trap: 3318 return NVPTXISD::Suld3DI16Trap; 3319 case Intrinsic::nvvm_suld_3d_i32_trap: 3320 return NVPTXISD::Suld3DI32Trap; 3321 case Intrinsic::nvvm_suld_3d_i64_trap: 3322 return NVPTXISD::Suld3DI64Trap; 3323 case Intrinsic::nvvm_suld_3d_v2i8_trap: 3324 return NVPTXISD::Suld3DV2I8Trap; 3325 case Intrinsic::nvvm_suld_3d_v2i16_trap: 3326 return NVPTXISD::Suld3DV2I16Trap; 3327 case Intrinsic::nvvm_suld_3d_v2i32_trap: 3328 return NVPTXISD::Suld3DV2I32Trap; 3329 case Intrinsic::nvvm_suld_3d_v2i64_trap: 3330 return NVPTXISD::Suld3DV2I64Trap; 3331 case Intrinsic::nvvm_suld_3d_v4i8_trap: 3332 return NVPTXISD::Suld3DV4I8Trap; 3333 case Intrinsic::nvvm_suld_3d_v4i16_trap: 3334 return NVPTXISD::Suld3DV4I16Trap; 3335 case Intrinsic::nvvm_suld_3d_v4i32_trap: 3336 return NVPTXISD::Suld3DV4I32Trap; 3337 case Intrinsic::nvvm_suld_1d_i8_zero: 3338 return NVPTXISD::Suld1DI8Zero; 3339 case Intrinsic::nvvm_suld_1d_i16_zero: 3340 return NVPTXISD::Suld1DI16Zero; 3341 case Intrinsic::nvvm_suld_1d_i32_zero: 3342 return NVPTXISD::Suld1DI32Zero; 3343 case Intrinsic::nvvm_suld_1d_i64_zero: 3344 return NVPTXISD::Suld1DI64Zero; 3345 case Intrinsic::nvvm_suld_1d_v2i8_zero: 3346 return NVPTXISD::Suld1DV2I8Zero; 3347 case Intrinsic::nvvm_suld_1d_v2i16_zero: 3348 return NVPTXISD::Suld1DV2I16Zero; 3349 case Intrinsic::nvvm_suld_1d_v2i32_zero: 3350 return NVPTXISD::Suld1DV2I32Zero; 3351 case Intrinsic::nvvm_suld_1d_v2i64_zero: 3352 return NVPTXISD::Suld1DV2I64Zero; 3353 case Intrinsic::nvvm_suld_1d_v4i8_zero: 3354 return NVPTXISD::Suld1DV4I8Zero; 3355 case Intrinsic::nvvm_suld_1d_v4i16_zero: 3356 return NVPTXISD::Suld1DV4I16Zero; 3357 case Intrinsic::nvvm_suld_1d_v4i32_zero: 3358 return NVPTXISD::Suld1DV4I32Zero; 3359 case Intrinsic::nvvm_suld_1d_array_i8_zero: 3360 return NVPTXISD::Suld1DArrayI8Zero; 3361 case Intrinsic::nvvm_suld_1d_array_i16_zero: 3362 return 
NVPTXISD::Suld1DArrayI16Zero; 3363 case Intrinsic::nvvm_suld_1d_array_i32_zero: 3364 return NVPTXISD::Suld1DArrayI32Zero; 3365 case Intrinsic::nvvm_suld_1d_array_i64_zero: 3366 return NVPTXISD::Suld1DArrayI64Zero; 3367 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 3368 return NVPTXISD::Suld1DArrayV2I8Zero; 3369 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 3370 return NVPTXISD::Suld1DArrayV2I16Zero; 3371 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 3372 return NVPTXISD::Suld1DArrayV2I32Zero; 3373 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 3374 return NVPTXISD::Suld1DArrayV2I64Zero; 3375 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 3376 return NVPTXISD::Suld1DArrayV4I8Zero; 3377 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 3378 return NVPTXISD::Suld1DArrayV4I16Zero; 3379 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 3380 return NVPTXISD::Suld1DArrayV4I32Zero; 3381 case Intrinsic::nvvm_suld_2d_i8_zero: 3382 return NVPTXISD::Suld2DI8Zero; 3383 case Intrinsic::nvvm_suld_2d_i16_zero: 3384 return NVPTXISD::Suld2DI16Zero; 3385 case Intrinsic::nvvm_suld_2d_i32_zero: 3386 return NVPTXISD::Suld2DI32Zero; 3387 case Intrinsic::nvvm_suld_2d_i64_zero: 3388 return NVPTXISD::Suld2DI64Zero; 3389 case Intrinsic::nvvm_suld_2d_v2i8_zero: 3390 return NVPTXISD::Suld2DV2I8Zero; 3391 case Intrinsic::nvvm_suld_2d_v2i16_zero: 3392 return NVPTXISD::Suld2DV2I16Zero; 3393 case Intrinsic::nvvm_suld_2d_v2i32_zero: 3394 return NVPTXISD::Suld2DV2I32Zero; 3395 case Intrinsic::nvvm_suld_2d_v2i64_zero: 3396 return NVPTXISD::Suld2DV2I64Zero; 3397 case Intrinsic::nvvm_suld_2d_v4i8_zero: 3398 return NVPTXISD::Suld2DV4I8Zero; 3399 case Intrinsic::nvvm_suld_2d_v4i16_zero: 3400 return NVPTXISD::Suld2DV4I16Zero; 3401 case Intrinsic::nvvm_suld_2d_v4i32_zero: 3402 return NVPTXISD::Suld2DV4I32Zero; 3403 case Intrinsic::nvvm_suld_2d_array_i8_zero: 3404 return NVPTXISD::Suld2DArrayI8Zero; 3405 case Intrinsic::nvvm_suld_2d_array_i16_zero: 3406 return NVPTXISD::Suld2DArrayI16Zero; 3407 case Intrinsic::nvvm_suld_2d_array_i32_zero: 3408 return NVPTXISD::Suld2DArrayI32Zero; 3409 case Intrinsic::nvvm_suld_2d_array_i64_zero: 3410 return NVPTXISD::Suld2DArrayI64Zero; 3411 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 3412 return NVPTXISD::Suld2DArrayV2I8Zero; 3413 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 3414 return NVPTXISD::Suld2DArrayV2I16Zero; 3415 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 3416 return NVPTXISD::Suld2DArrayV2I32Zero; 3417 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 3418 return NVPTXISD::Suld2DArrayV2I64Zero; 3419 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 3420 return NVPTXISD::Suld2DArrayV4I8Zero; 3421 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 3422 return NVPTXISD::Suld2DArrayV4I16Zero; 3423 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 3424 return NVPTXISD::Suld2DArrayV4I32Zero; 3425 case Intrinsic::nvvm_suld_3d_i8_zero: 3426 return NVPTXISD::Suld3DI8Zero; 3427 case Intrinsic::nvvm_suld_3d_i16_zero: 3428 return NVPTXISD::Suld3DI16Zero; 3429 case Intrinsic::nvvm_suld_3d_i32_zero: 3430 return NVPTXISD::Suld3DI32Zero; 3431 case Intrinsic::nvvm_suld_3d_i64_zero: 3432 return NVPTXISD::Suld3DI64Zero; 3433 case Intrinsic::nvvm_suld_3d_v2i8_zero: 3434 return NVPTXISD::Suld3DV2I8Zero; 3435 case Intrinsic::nvvm_suld_3d_v2i16_zero: 3436 return NVPTXISD::Suld3DV2I16Zero; 3437 case Intrinsic::nvvm_suld_3d_v2i32_zero: 3438 return NVPTXISD::Suld3DV2I32Zero; 3439 case Intrinsic::nvvm_suld_3d_v2i64_zero: 3440 return NVPTXISD::Suld3DV2I64Zero; 3441 case Intrinsic::nvvm_suld_3d_v4i8_zero: 3442 return 
NVPTXISD::Suld3DV4I8Zero;
3443   case Intrinsic::nvvm_suld_3d_v4i16_zero:
3444     return NVPTXISD::Suld3DV4I16Zero;
3445   case Intrinsic::nvvm_suld_3d_v4i32_zero:
3446     return NVPTXISD::Suld3DV4I32Zero;
3447   }
3448 }
3449 
3450 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3451 // TgtMemIntrinsic because we need the information that is only available
3452 // in the "Value" type of the destination pointer. In particular, we need
3453 // its address space information.
3454 
3455 bool NVPTXTargetLowering::getTgtMemIntrinsic(
3456     IntrinsicInfo &Info, const CallInst &I,
3457     MachineFunction &MF, unsigned Intrinsic) const {
3458   switch (Intrinsic) {
3459   default:
3460     return false;
3461   case Intrinsic::nvvm_match_all_sync_i32p:
3462   case Intrinsic::nvvm_match_all_sync_i64p:
3463     Info.opc = ISD::INTRINSIC_W_CHAIN;
3464     // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
3465     // in order to model data exchange with other threads, but perform no real
3466     // memory accesses.
3467     Info.memVT = MVT::i1;
3468 
3469     // Our result depends on both our and other thread's arguments.
3470     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3471     return true;
3472   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3473   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3474   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3475   case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3476   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3477   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3478   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3479   case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3480   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3481   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3482   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3483   case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3484   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3485   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3486   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3487   case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3488   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3489   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3490   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3491   case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3492   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3493   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3494   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3495   case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3496     Info.opc = ISD::INTRINSIC_W_CHAIN;
3497     Info.memVT = MVT::v8f16;
3498     Info.ptrVal = I.getArgOperand(0);
3499     Info.offset = 0;
3500     Info.flags = MachineMemOperand::MOLoad;
3501     Info.align = Align(16);
3502     return true;
3503   }
3504   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3505   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3506   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3507   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3508   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3509   case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3510   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3511   case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3512   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3513   case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3514   case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3515   case
Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 3516 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 3517 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 3518 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 3519 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: { 3520 Info.opc = ISD::INTRINSIC_W_CHAIN; 3521 Info.memVT = MVT::v2i32; 3522 Info.ptrVal = I.getArgOperand(0); 3523 Info.offset = 0; 3524 Info.flags = MachineMemOperand::MOLoad; 3525 Info.align = Align(8); 3526 return true; 3527 } 3528 3529 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 3530 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 3531 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 3532 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 3533 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 3534 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 3535 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 3536 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 3537 3538 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 3539 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 3540 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 3541 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 3542 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 3543 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 3544 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 3545 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: { 3546 Info.opc = ISD::INTRINSIC_W_CHAIN; 3547 Info.memVT = MVT::v4i32; 3548 Info.ptrVal = I.getArgOperand(0); 3549 Info.offset = 0; 3550 Info.flags = MachineMemOperand::MOLoad; 3551 Info.align = Align(16); 3552 return true; 3553 } 3554 3555 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 3556 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 3557 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 3558 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 3559 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 3560 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 3561 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 3562 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 3563 3564 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 3565 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 3566 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 3567 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 3568 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 3569 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 3570 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 3571 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 3572 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 3573 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 3574 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 3575 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 3576 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 3577 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 3578 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 3579 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 3580 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 3581 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 3582 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 3583 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: { 3584 Info.opc = ISD::INTRINSIC_W_CHAIN; 3585 Info.memVT = MVT::i32; 3586 Info.ptrVal = I.getArgOperand(0); 3587 Info.offset = 0; 3588 Info.flags = MachineMemOperand::MOLoad; 3589 Info.align = Align(4); 3590 return true; 
3591 } 3592 3593 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 3594 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 3595 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 3596 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 3597 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 3598 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 3599 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 3600 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 3601 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 3602 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 3603 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 3604 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 3605 Info.opc = ISD::INTRINSIC_W_CHAIN; 3606 Info.memVT = MVT::v4f16; 3607 Info.ptrVal = I.getArgOperand(0); 3608 Info.offset = 0; 3609 Info.flags = MachineMemOperand::MOLoad; 3610 Info.align = Align(16); 3611 return true; 3612 } 3613 3614 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 3615 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 3616 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 3617 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 3618 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 3619 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 3620 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 3621 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 3622 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 3623 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 3624 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 3625 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: { 3626 Info.opc = ISD::INTRINSIC_W_CHAIN; 3627 Info.memVT = MVT::v8f32; 3628 Info.ptrVal = I.getArgOperand(0); 3629 Info.offset = 0; 3630 Info.flags = MachineMemOperand::MOLoad; 3631 Info.align = Align(16); 3632 return true; 3633 } 3634 3635 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 3636 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 3637 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 3638 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 3639 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 3640 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 3641 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 3642 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 3643 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 3644 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 3645 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 3646 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 3647 Info.opc = ISD::INTRINSIC_W_CHAIN; 3648 Info.memVT = MVT::v8i32; 3649 Info.ptrVal = I.getArgOperand(0); 3650 Info.offset = 0; 3651 Info.flags = MachineMemOperand::MOLoad; 3652 Info.align = Align(16); 3653 return true; 3654 } 3655 3656 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 3657 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 3658 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 3659 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 3660 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 3661 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 3662 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 3663 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: { 3664 Info.opc = ISD::INTRINSIC_W_CHAIN; 3665 Info.memVT = MVT::v2i32; 3666 Info.ptrVal = I.getArgOperand(0); 3667 Info.offset = 0; 3668 Info.flags = MachineMemOperand::MOLoad; 3669 Info.align = Align(8); 
3670 return true; 3671 } 3672 3673 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 3674 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 3675 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 3676 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 3677 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 3678 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 3679 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 3680 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 3681 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 3682 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 3683 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 3684 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 3685 Info.opc = ISD::INTRINSIC_VOID; 3686 Info.memVT = MVT::v4f16; 3687 Info.ptrVal = I.getArgOperand(0); 3688 Info.offset = 0; 3689 Info.flags = MachineMemOperand::MOStore; 3690 Info.align = Align(16); 3691 return true; 3692 } 3693 3694 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 3695 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 3696 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 3697 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 3698 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 3699 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 3700 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 3701 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 3702 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 3703 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 3704 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 3705 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: { 3706 Info.opc = ISD::INTRINSIC_VOID; 3707 Info.memVT = MVT::v8f32; 3708 Info.ptrVal = I.getArgOperand(0); 3709 Info.offset = 0; 3710 Info.flags = MachineMemOperand::MOStore; 3711 Info.align = Align(16); 3712 return true; 3713 } 3714 3715 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 3716 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 3717 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 3718 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 3719 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 3720 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 3721 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 3722 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 3723 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 3724 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 3725 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 3726 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 3727 Info.opc = ISD::INTRINSIC_VOID; 3728 Info.memVT = MVT::v8i32; 3729 Info.ptrVal = I.getArgOperand(0); 3730 Info.offset = 0; 3731 Info.flags = MachineMemOperand::MOStore; 3732 Info.align = Align(16); 3733 return true; 3734 } 3735 3736 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 3737 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 3738 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 3739 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 3740 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 3741 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 3742 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 3743 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 3744 Info.opc = ISD::INTRINSIC_VOID; 3745 Info.memVT = MVT::v2i32; 3746 Info.ptrVal = I.getArgOperand(0); 3747 Info.offset = 0; 3748 Info.flags = 
MachineMemOperand::MOStore; 3749 Info.align = Align(8); 3750 return true; 3751 } 3752 3753 case Intrinsic::nvvm_atomic_load_inc_32: 3754 case Intrinsic::nvvm_atomic_load_dec_32: 3755 3756 case Intrinsic::nvvm_atomic_add_gen_f_cta: 3757 case Intrinsic::nvvm_atomic_add_gen_f_sys: 3758 case Intrinsic::nvvm_atomic_add_gen_i_cta: 3759 case Intrinsic::nvvm_atomic_add_gen_i_sys: 3760 case Intrinsic::nvvm_atomic_and_gen_i_cta: 3761 case Intrinsic::nvvm_atomic_and_gen_i_sys: 3762 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 3763 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 3764 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 3765 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 3766 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 3767 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 3768 case Intrinsic::nvvm_atomic_max_gen_i_cta: 3769 case Intrinsic::nvvm_atomic_max_gen_i_sys: 3770 case Intrinsic::nvvm_atomic_min_gen_i_cta: 3771 case Intrinsic::nvvm_atomic_min_gen_i_sys: 3772 case Intrinsic::nvvm_atomic_or_gen_i_cta: 3773 case Intrinsic::nvvm_atomic_or_gen_i_sys: 3774 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 3775 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 3776 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 3777 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 3778 auto &DL = I.getModule()->getDataLayout(); 3779 Info.opc = ISD::INTRINSIC_W_CHAIN; 3780 Info.memVT = getValueType(DL, I.getType()); 3781 Info.ptrVal = I.getArgOperand(0); 3782 Info.offset = 0; 3783 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 3784 Info.align.reset(); 3785 return true; 3786 } 3787 3788 case Intrinsic::nvvm_ldu_global_i: 3789 case Intrinsic::nvvm_ldu_global_f: 3790 case Intrinsic::nvvm_ldu_global_p: { 3791 auto &DL = I.getModule()->getDataLayout(); 3792 Info.opc = ISD::INTRINSIC_W_CHAIN; 3793 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 3794 Info.memVT = getValueType(DL, I.getType()); 3795 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 3796 Info.memVT = getPointerTy(DL); 3797 else 3798 Info.memVT = getValueType(DL, I.getType()); 3799 Info.ptrVal = I.getArgOperand(0); 3800 Info.offset = 0; 3801 Info.flags = MachineMemOperand::MOLoad; 3802 Info.align = 3803 MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); 3804 3805 return true; 3806 } 3807 case Intrinsic::nvvm_ldg_global_i: 3808 case Intrinsic::nvvm_ldg_global_f: 3809 case Intrinsic::nvvm_ldg_global_p: { 3810 auto &DL = I.getModule()->getDataLayout(); 3811 3812 Info.opc = ISD::INTRINSIC_W_CHAIN; 3813 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 3814 Info.memVT = getValueType(DL, I.getType()); 3815 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 3816 Info.memVT = getPointerTy(DL); 3817 else 3818 Info.memVT = getValueType(DL, I.getType()); 3819 Info.ptrVal = I.getArgOperand(0); 3820 Info.offset = 0; 3821 Info.flags = MachineMemOperand::MOLoad; 3822 Info.align = 3823 MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); 3824 3825 return true; 3826 } 3827 3828 case Intrinsic::nvvm_tex_1d_v4f32_s32: 3829 case Intrinsic::nvvm_tex_1d_v4f32_f32: 3830 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 3831 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3832 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3833 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3834 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3835 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3836 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3837 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3838 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3839 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3840 case 
Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3841 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3842 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3843 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3844 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3845 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3846 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3847 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3848 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3849 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3850 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3851 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3852 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3853 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3854 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3855 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3856 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3857 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3858 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3859 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3860 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3861 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3862 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3863 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3864 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3865 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3866 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3867 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3868 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3869 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3870 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3871 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3872 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3873 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3874 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3875 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3876 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3877 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3878 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3879 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3880 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3881 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3882 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3883 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3884 Info.opc = getOpcForTextureInstr(Intrinsic); 3885 Info.memVT = MVT::v4f32; 3886 Info.ptrVal = nullptr; 3887 Info.offset = 0; 3888 Info.flags = MachineMemOperand::MOLoad; 3889 Info.align = Align(16); 3890 return true; 3891 3892 case Intrinsic::nvvm_tex_1d_v4s32_s32: 3893 case Intrinsic::nvvm_tex_1d_v4s32_f32: 3894 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 3895 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 3896 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 3897 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 3898 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 3899 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 3900 case Intrinsic::nvvm_tex_2d_v4s32_s32: 3901 case Intrinsic::nvvm_tex_2d_v4s32_f32: 3902 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 3903 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 3904 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3905 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3906 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3907 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3908 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3909 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3910 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3911 
case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3912 case Intrinsic::nvvm_tex_cube_v4s32_f32: 3913 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3914 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3915 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 3916 case Intrinsic::nvvm_tex_cube_v4u32_f32: 3917 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 3918 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 3919 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 3920 case Intrinsic::nvvm_tex_1d_v4u32_s32: 3921 case Intrinsic::nvvm_tex_1d_v4u32_f32: 3922 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 3923 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 3924 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 3925 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 3926 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 3927 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 3928 case Intrinsic::nvvm_tex_2d_v4u32_s32: 3929 case Intrinsic::nvvm_tex_2d_v4u32_f32: 3930 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 3931 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 3932 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 3933 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 3934 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 3935 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 3936 case Intrinsic::nvvm_tex_3d_v4u32_s32: 3937 case Intrinsic::nvvm_tex_3d_v4u32_f32: 3938 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 3939 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 3940 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 3941 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 3942 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 3943 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 3944 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 3945 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 3946 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 3947 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 3948 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 3949 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 3950 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 3951 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 3952 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 3953 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3954 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3955 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3956 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3957 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3958 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3959 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3960 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3961 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3962 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3963 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3964 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3965 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3966 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3967 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3968 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 3969 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 3970 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 3971 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 3972 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3973 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3974 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3975 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3976 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3977 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3978 case 
Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3979 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3980 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3981 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3982 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3983 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3984 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3985 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3986 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3987 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3988 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3989 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3990 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3991 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3992 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3993 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3994 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3995 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3996 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3997 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3998 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3999 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 4000 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 4001 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 4002 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 4003 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 4004 Info.opc = getOpcForTextureInstr(Intrinsic); 4005 Info.memVT = MVT::v4i32; 4006 Info.ptrVal = nullptr; 4007 Info.offset = 0; 4008 Info.flags = MachineMemOperand::MOLoad; 4009 Info.align = Align(16); 4010 return true; 4011 4012 case Intrinsic::nvvm_suld_1d_i8_clamp: 4013 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 4014 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 4015 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 4016 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 4017 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 4018 case Intrinsic::nvvm_suld_2d_i8_clamp: 4019 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 4020 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4021 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4022 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4023 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4024 case Intrinsic::nvvm_suld_3d_i8_clamp: 4025 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4026 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4027 case Intrinsic::nvvm_suld_1d_i8_trap: 4028 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4029 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4030 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4031 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4032 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4033 case Intrinsic::nvvm_suld_2d_i8_trap: 4034 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4035 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4036 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4037 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4038 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4039 case Intrinsic::nvvm_suld_3d_i8_trap: 4040 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4041 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4042 case Intrinsic::nvvm_suld_1d_i8_zero: 4043 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4044 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4045 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4046 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4047 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4048 case Intrinsic::nvvm_suld_2d_i8_zero: 4049 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4050 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4051 
case Intrinsic::nvvm_suld_2d_array_i8_zero: 4052 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4053 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4054 case Intrinsic::nvvm_suld_3d_i8_zero: 4055 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4056 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4057 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4058 Info.memVT = MVT::i8; 4059 Info.ptrVal = nullptr; 4060 Info.offset = 0; 4061 Info.flags = MachineMemOperand::MOLoad; 4062 Info.align = Align(16); 4063 return true; 4064 4065 case Intrinsic::nvvm_suld_1d_i16_clamp: 4066 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4067 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4068 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4069 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4070 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4071 case Intrinsic::nvvm_suld_2d_i16_clamp: 4072 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4073 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4074 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4075 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4076 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4077 case Intrinsic::nvvm_suld_3d_i16_clamp: 4078 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4079 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4080 case Intrinsic::nvvm_suld_1d_i16_trap: 4081 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4082 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4083 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4084 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4085 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4086 case Intrinsic::nvvm_suld_2d_i16_trap: 4087 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4088 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4089 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4090 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4091 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4092 case Intrinsic::nvvm_suld_3d_i16_trap: 4093 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4094 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4095 case Intrinsic::nvvm_suld_1d_i16_zero: 4096 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4097 case Intrinsic::nvvm_suld_1d_v4i16_zero: 4098 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4099 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4100 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4101 case Intrinsic::nvvm_suld_2d_i16_zero: 4102 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4103 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4104 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4105 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4106 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4107 case Intrinsic::nvvm_suld_3d_i16_zero: 4108 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4109 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4110 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4111 Info.memVT = MVT::i16; 4112 Info.ptrVal = nullptr; 4113 Info.offset = 0; 4114 Info.flags = MachineMemOperand::MOLoad; 4115 Info.align = Align(16); 4116 return true; 4117 4118 case Intrinsic::nvvm_suld_1d_i32_clamp: 4119 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4120 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4121 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 4122 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 4123 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 4124 case Intrinsic::nvvm_suld_2d_i32_clamp: 4125 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 4126 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 4127 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 4128 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 4129 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 4130 case Intrinsic::nvvm_suld_3d_i32_clamp: 4131 case 
Intrinsic::nvvm_suld_3d_v2i32_clamp: 4132 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 4133 case Intrinsic::nvvm_suld_1d_i32_trap: 4134 case Intrinsic::nvvm_suld_1d_v2i32_trap: 4135 case Intrinsic::nvvm_suld_1d_v4i32_trap: 4136 case Intrinsic::nvvm_suld_1d_array_i32_trap: 4137 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 4138 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 4139 case Intrinsic::nvvm_suld_2d_i32_trap: 4140 case Intrinsic::nvvm_suld_2d_v2i32_trap: 4141 case Intrinsic::nvvm_suld_2d_v4i32_trap: 4142 case Intrinsic::nvvm_suld_2d_array_i32_trap: 4143 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 4144 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 4145 case Intrinsic::nvvm_suld_3d_i32_trap: 4146 case Intrinsic::nvvm_suld_3d_v2i32_trap: 4147 case Intrinsic::nvvm_suld_3d_v4i32_trap: 4148 case Intrinsic::nvvm_suld_1d_i32_zero: 4149 case Intrinsic::nvvm_suld_1d_v2i32_zero: 4150 case Intrinsic::nvvm_suld_1d_v4i32_zero: 4151 case Intrinsic::nvvm_suld_1d_array_i32_zero: 4152 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 4153 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 4154 case Intrinsic::nvvm_suld_2d_i32_zero: 4155 case Intrinsic::nvvm_suld_2d_v2i32_zero: 4156 case Intrinsic::nvvm_suld_2d_v4i32_zero: 4157 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4158 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4159 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4160 case Intrinsic::nvvm_suld_3d_i32_zero: 4161 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4162 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4163 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4164 Info.memVT = MVT::i32; 4165 Info.ptrVal = nullptr; 4166 Info.offset = 0; 4167 Info.flags = MachineMemOperand::MOLoad; 4168 Info.align = Align(16); 4169 return true; 4170 4171 case Intrinsic::nvvm_suld_1d_i64_clamp: 4172 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 4173 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 4174 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 4175 case Intrinsic::nvvm_suld_2d_i64_clamp: 4176 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 4177 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 4178 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 4179 case Intrinsic::nvvm_suld_3d_i64_clamp: 4180 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 4181 case Intrinsic::nvvm_suld_1d_i64_trap: 4182 case Intrinsic::nvvm_suld_1d_v2i64_trap: 4183 case Intrinsic::nvvm_suld_1d_array_i64_trap: 4184 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 4185 case Intrinsic::nvvm_suld_2d_i64_trap: 4186 case Intrinsic::nvvm_suld_2d_v2i64_trap: 4187 case Intrinsic::nvvm_suld_2d_array_i64_trap: 4188 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 4189 case Intrinsic::nvvm_suld_3d_i64_trap: 4190 case Intrinsic::nvvm_suld_3d_v2i64_trap: 4191 case Intrinsic::nvvm_suld_1d_i64_zero: 4192 case Intrinsic::nvvm_suld_1d_v2i64_zero: 4193 case Intrinsic::nvvm_suld_1d_array_i64_zero: 4194 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 4195 case Intrinsic::nvvm_suld_2d_i64_zero: 4196 case Intrinsic::nvvm_suld_2d_v2i64_zero: 4197 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4198 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4199 case Intrinsic::nvvm_suld_3d_i64_zero: 4200 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4201 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4202 Info.memVT = MVT::i64; 4203 Info.ptrVal = nullptr; 4204 Info.offset = 0; 4205 Info.flags = MachineMemOperand::MOLoad; 4206 Info.align = Align(16); 4207 return true; 4208 } 4209 return false; 4210 } 4211 4212 /// isLegalAddressingMode - Return true if the addressing mode represented 4213 /// by AM is legal for this target, 
for a load/store of the specified type. 4214 /// Used to guide target specific optimizations, like loop strength reduction 4215 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 4216 /// (CodeGenPrepare.cpp) 4217 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, 4218 const AddrMode &AM, Type *Ty, 4219 unsigned AS, Instruction *I) const { 4220 // AddrMode - This represents an addressing mode of: 4221 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 4222 // 4223 // The legal address modes are 4224 // - [avar] 4225 // - [areg] 4226 // - [areg+immoff] 4227 // - [immAddr] 4228 4229 if (AM.BaseGV) { 4230 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; 4231 } 4232 4233 switch (AM.Scale) { 4234 case 0: // "r", "r+i" or "i" is allowed 4235 break; 4236 case 1: 4237 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 4238 return false; 4239 // Otherwise we have r+i. 4240 break; 4241 default: 4242 // No scale > 1 is allowed 4243 return false; 4244 } 4245 return true; 4246 } 4247 4248 //===----------------------------------------------------------------------===// 4249 // NVPTX Inline Assembly Support 4250 //===----------------------------------------------------------------------===// 4251 4252 /// getConstraintType - Given a constraint letter, return the type of 4253 /// constraint it is for this target. 4254 NVPTXTargetLowering::ConstraintType 4255 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { 4256 if (Constraint.size() == 1) { 4257 switch (Constraint[0]) { 4258 default: 4259 break; 4260 case 'b': 4261 case 'r': 4262 case 'h': 4263 case 'c': 4264 case 'l': 4265 case 'f': 4266 case 'd': 4267 case '0': 4268 case 'N': 4269 return C_RegisterClass; 4270 } 4271 } 4272 return TargetLowering::getConstraintType(Constraint); 4273 } 4274 4275 std::pair<unsigned, const TargetRegisterClass *> 4276 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 4277 StringRef Constraint, 4278 MVT VT) const { 4279 if (Constraint.size() == 1) { 4280 switch (Constraint[0]) { 4281 case 'b': 4282 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 4283 case 'c': 4284 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 4285 case 'h': 4286 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 4287 case 'r': 4288 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 4289 case 'l': 4290 case 'N': 4291 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 4292 case 'f': 4293 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 4294 case 'd': 4295 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 4296 } 4297 } 4298 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4299 } 4300 4301 //===----------------------------------------------------------------------===// 4302 // NVPTX DAG Combining 4303 //===----------------------------------------------------------------------===// 4304 4305 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, 4306 CodeGenOpt::Level OptLevel) const { 4307 // Always honor command-line argument 4308 if (FMAContractLevelOpt.getNumOccurrences() > 0) 4309 return FMAContractLevelOpt > 0; 4310 4311 // Do not contract if we're not optimizing the code. 4312 if (OptLevel == 0) 4313 return false; 4314 4315 // Honor TargetOptions flags that explicitly say fusion is okay. 
  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
    return true;

  return allowUnsafeFPMath(MF);
}

bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  if (MF.getTarget().Options.UnsafeFPMath)
    return true;

  // Allow unsafe math if the unsafe-fp-math attribute explicitly says so.
  const Function &F = MF.getFunction();
  if (F.hasFnAttribute("unsafe-fp-math")) {
    Attribute Attr = F.getFnAttribute("unsafe-fp-math");
    StringRef Val = Attr.getValueAsString();
    if (Val == "true")
      return true;
  }

  return false;
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const NVPTXSubtarget &Subtarget,
                                             CodeGenOpt::Level OptLevel) {
  SelectionDAG &DAG = DCI.DAG;
  // Skip the non-scalar (vector) case; only scalar adds are handled here.
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
    // For integer:
    // Since integer multiply-add costs the same as integer multiply
    // but is more costly than integer add, do the fusion only when
    // the mul is only used by the add.
    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                       N0.getOperand(0), N0.getOperand(1), N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
          &DAG.getTargetLoweringInfo());
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has fewer than 5 uses and all of
      // them are adds.
      // The heuristic is that if a use is not an add, then that use cannot
      // be fused into an fma, so the mul is still needed anyway.
      // If there are more than 4 uses, even if they are all adds, fusing
      // them will increase register pressure.
      //
      int numUses = 0;
      int nonAddCount = 0;
      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
                                UE = N0.getNode()->use_end();
           UI != UE; ++UI) {
        numUses++;
        SDNode *User = *UI;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for considering potential register pressure: the
        // difference is used to measure the distance between the def and the
        // use, and the longer the distance, the more likely it is to cause
        // register pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // node N, which guarantees that the FMA will not increase register
        // pressure at node N.
4408 bool opIsLive = false; 4409 const SDNode *left = N0.getOperand(0).getNode(); 4410 const SDNode *right = N0.getOperand(1).getNode(); 4411 4412 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right)) 4413 opIsLive = true; 4414 4415 if (!opIsLive) 4416 for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) { 4417 SDNode *User = *UI; 4418 int orderNo3 = User->getIROrder(); 4419 if (orderNo3 > orderNo) { 4420 opIsLive = true; 4421 break; 4422 } 4423 } 4424 4425 if (!opIsLive) 4426 for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) { 4427 SDNode *User = *UI; 4428 int orderNo3 = User->getIROrder(); 4429 if (orderNo3 > orderNo) { 4430 opIsLive = true; 4431 break; 4432 } 4433 } 4434 4435 if (!opIsLive) 4436 return SDValue(); 4437 } 4438 4439 return DAG.getNode(ISD::FMA, SDLoc(N), VT, 4440 N0.getOperand(0), N0.getOperand(1), N1); 4441 } 4442 } 4443 4444 return SDValue(); 4445 } 4446 4447 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 4448 /// 4449 static SDValue PerformADDCombine(SDNode *N, 4450 TargetLowering::DAGCombinerInfo &DCI, 4451 const NVPTXSubtarget &Subtarget, 4452 CodeGenOpt::Level OptLevel) { 4453 SDValue N0 = N->getOperand(0); 4454 SDValue N1 = N->getOperand(1); 4455 4456 // First try with the default operand order. 4457 if (SDValue Result = 4458 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) 4459 return Result; 4460 4461 // If that didn't work, try again with the operands commuted. 4462 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); 4463 } 4464 4465 static SDValue PerformANDCombine(SDNode *N, 4466 TargetLowering::DAGCombinerInfo &DCI) { 4467 // The type legalizer turns a vector load of i8 values into a zextload to i16 4468 // registers, optionally ANY_EXTENDs it (if target type is integer), 4469 // and ANDs off the high 8 bits. Since we turn this load into a 4470 // target-specific DAG node, the DAG combiner fails to eliminate these AND 4471 // nodes. Do that here. 4472 SDValue Val = N->getOperand(0); 4473 SDValue Mask = N->getOperand(1); 4474 4475 if (isa<ConstantSDNode>(Val)) { 4476 std::swap(Val, Mask); 4477 } 4478 4479 SDValue AExt; 4480 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 4481 if (Val.getOpcode() == ISD::ANY_EXTEND) { 4482 AExt = Val; 4483 Val = Val->getOperand(0); 4484 } 4485 4486 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 4487 Val = Val->getOperand(0); 4488 } 4489 4490 if (Val->getOpcode() == NVPTXISD::LoadV2 || 4491 Val->getOpcode() == NVPTXISD::LoadV4) { 4492 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 4493 if (!MaskCnst) { 4494 // Not an AND with a constant 4495 return SDValue(); 4496 } 4497 4498 uint64_t MaskVal = MaskCnst->getZExtValue(); 4499 if (MaskVal != 0xff) { 4500 // Not an AND that chops off top 8 bits 4501 return SDValue(); 4502 } 4503 4504 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 4505 if (!Mem) { 4506 // Not a MemSDNode?!? 
4507 return SDValue(); 4508 } 4509 4510 EVT MemVT = Mem->getMemoryVT(); 4511 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 4512 // We only handle the i8 case 4513 return SDValue(); 4514 } 4515 4516 unsigned ExtType = 4517 cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> 4518 getZExtValue(); 4519 if (ExtType == ISD::SEXTLOAD) { 4520 // If for some reason the load is a sextload, the and is needed to zero 4521 // out the high 8 bits 4522 return SDValue(); 4523 } 4524 4525 bool AddTo = false; 4526 if (AExt.getNode() != nullptr) { 4527 // Re-insert the ext as a zext. 4528 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 4529 AExt.getValueType(), Val); 4530 AddTo = true; 4531 } 4532 4533 // If we get here, the AND is unnecessary. Just replace it with the load 4534 DCI.CombineTo(N, Val, AddTo); 4535 } 4536 4537 return SDValue(); 4538 } 4539 4540 static SDValue PerformREMCombine(SDNode *N, 4541 TargetLowering::DAGCombinerInfo &DCI, 4542 CodeGenOpt::Level OptLevel) { 4543 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); 4544 4545 // Don't do anything at less than -O2. 4546 if (OptLevel < CodeGenOpt::Default) 4547 return SDValue(); 4548 4549 SelectionDAG &DAG = DCI.DAG; 4550 SDLoc DL(N); 4551 EVT VT = N->getValueType(0); 4552 bool IsSigned = N->getOpcode() == ISD::SREM; 4553 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; 4554 4555 const SDValue &Num = N->getOperand(0); 4556 const SDValue &Den = N->getOperand(1); 4557 4558 for (const SDNode *U : Num->uses()) { 4559 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && 4560 U->getOperand(1) == Den) { 4561 // Num % Den -> Num - (Num / Den) * Den 4562 return DAG.getNode(ISD::SUB, DL, VT, Num, 4563 DAG.getNode(ISD::MUL, DL, VT, 4564 DAG.getNode(DivOpc, DL, VT, Num, Den), 4565 Den)); 4566 } 4567 } 4568 return SDValue(); 4569 } 4570 4571 enum OperandSignedness { 4572 Signed = 0, 4573 Unsigned, 4574 Unknown 4575 }; 4576 4577 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 4578 /// that can be demoted to \p OptSize bits without loss of information. The 4579 /// signedness of the operand, if determinable, is placed in \p S. 4580 static bool IsMulWideOperandDemotable(SDValue Op, 4581 unsigned OptSize, 4582 OperandSignedness &S) { 4583 S = Unknown; 4584 4585 if (Op.getOpcode() == ISD::SIGN_EXTEND || 4586 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 4587 EVT OrigVT = Op.getOperand(0).getValueType(); 4588 if (OrigVT.getSizeInBits() <= OptSize) { 4589 S = Signed; 4590 return true; 4591 } 4592 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 4593 EVT OrigVT = Op.getOperand(0).getValueType(); 4594 if (OrigVT.getSizeInBits() <= OptSize) { 4595 S = Unsigned; 4596 return true; 4597 } 4598 } 4599 4600 return false; 4601 } 4602 4603 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 4604 /// be demoted to \p OptSize bits without loss of information. If the operands 4605 /// contain a constant, it should appear as the RHS operand. The signedness of 4606 /// the operands is placed in \p IsSigned. 
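/// For example (illustrative), with \p OptSize == 16, a sign-extended-from-i16
/// LHS paired with an RHS constant that fits in a signed 16-bit value is
/// demotable.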
4607 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 4608 unsigned OptSize, 4609 bool &IsSigned) { 4610 OperandSignedness LHSSign; 4611 4612 // The LHS operand must be a demotable op 4613 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 4614 return false; 4615 4616 // We should have been able to determine the signedness from the LHS 4617 if (LHSSign == Unknown) 4618 return false; 4619 4620 IsSigned = (LHSSign == Signed); 4621 4622 // The RHS can be a demotable op or a constant 4623 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 4624 const APInt &Val = CI->getAPIntValue(); 4625 if (LHSSign == Unsigned) { 4626 return Val.isIntN(OptSize); 4627 } else { 4628 return Val.isSignedIntN(OptSize); 4629 } 4630 } else { 4631 OperandSignedness RHSSign; 4632 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 4633 return false; 4634 4635 return LHSSign == RHSSign; 4636 } 4637 } 4638 4639 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 4640 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 4641 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 4642 /// amount. 4643 static SDValue TryMULWIDECombine(SDNode *N, 4644 TargetLowering::DAGCombinerInfo &DCI) { 4645 EVT MulType = N->getValueType(0); 4646 if (MulType != MVT::i32 && MulType != MVT::i64) { 4647 return SDValue(); 4648 } 4649 4650 SDLoc DL(N); 4651 unsigned OptSize = MulType.getSizeInBits() >> 1; 4652 SDValue LHS = N->getOperand(0); 4653 SDValue RHS = N->getOperand(1); 4654 4655 // Canonicalize the multiply so the constant (if any) is on the right 4656 if (N->getOpcode() == ISD::MUL) { 4657 if (isa<ConstantSDNode>(LHS)) { 4658 std::swap(LHS, RHS); 4659 } 4660 } 4661 4662 // If we have a SHL, determine the actual multiply amount 4663 if (N->getOpcode() == ISD::SHL) { 4664 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 4665 if (!ShlRHS) { 4666 return SDValue(); 4667 } 4668 4669 APInt ShiftAmt = ShlRHS->getAPIntValue(); 4670 unsigned BitWidth = MulType.getSizeInBits(); 4671 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 4672 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 4673 RHS = DCI.DAG.getConstant(MulVal, DL, MulType); 4674 } else { 4675 return SDValue(); 4676 } 4677 } 4678 4679 bool Signed; 4680 // Verify that our operands are demotable 4681 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 4682 return SDValue(); 4683 } 4684 4685 EVT DemotedVT; 4686 if (MulType == MVT::i32) { 4687 DemotedVT = MVT::i16; 4688 } else { 4689 DemotedVT = MVT::i32; 4690 } 4691 4692 // Truncate the operands to the correct size. Note that these are just for 4693 // type consistency and will (likely) be eliminated in later phases. 4694 SDValue TruncLHS = 4695 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); 4696 SDValue TruncRHS = 4697 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); 4698 4699 unsigned Opc; 4700 if (Signed) { 4701 Opc = NVPTXISD::MUL_WIDE_SIGNED; 4702 } else { 4703 Opc = NVPTXISD::MUL_WIDE_UNSIGNED; 4704 } 4705 4706 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); 4707 } 4708 4709 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. 
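/// For example, a 32-bit multiply whose operands are both sign-extended from
/// 16 bits can be rewritten by TryMULWIDECombine above into a
/// NVPTXISD::MUL_WIDE_SIGNED node, i.e. a 16-bit mul.wide.s16 that still
/// produces the full 32-bit product.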
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);

  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
                                   DCI.DAG.getVTList(MVT::i1, MVT::i1),
                                   {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
    default: break;
    case ISD::ADD:
    case ISD::FADD:
      return PerformADDCombine(N, DCI, STI, OptLevel);
    case ISD::MUL:
      return PerformMULCombine(N, DCI, OptLevel);
    case ISD::SHL:
      return PerformSHLCombine(N, DCI, OptLevel);
    case ISD::AND:
      return PerformANDCombine(N, DCI);
    case ISD::UREM:
    case ISD::SREM:
      return PerformREMCombine(N, DCI, OptLevel);
    case ISD::SETCC:
      return PerformSETCCCombine(N, DCI);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
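  // For example, a sufficiently aligned load of <4 x float> is rewritten below
  // into a single NVPTXISD::LoadV4 node producing four f32 results plus a
  // chain, and the results are recombined into the original vector type with a
  // BUILD_VECTOR.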
4790 assert(ResVT.isSimple() && "Can only handle simple types"); 4791 switch (ResVT.getSimpleVT().SimpleTy) { 4792 default: 4793 return; 4794 case MVT::v2i8: 4795 case MVT::v2i16: 4796 case MVT::v2i32: 4797 case MVT::v2i64: 4798 case MVT::v2f16: 4799 case MVT::v2f32: 4800 case MVT::v2f64: 4801 case MVT::v4i8: 4802 case MVT::v4i16: 4803 case MVT::v4i32: 4804 case MVT::v4f16: 4805 case MVT::v4f32: 4806 case MVT::v8f16: // <4 x f16x2> 4807 // This is a "native" vector type 4808 break; 4809 } 4810 4811 LoadSDNode *LD = cast<LoadSDNode>(N); 4812 4813 unsigned Align = LD->getAlignment(); 4814 auto &TD = DAG.getDataLayout(); 4815 unsigned PrefAlign = 4816 TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); 4817 if (Align < PrefAlign) { 4818 // This load is not sufficiently aligned, so bail out and let this vector 4819 // load be scalarized. Note that we may still be able to emit smaller 4820 // vector loads. For example, if we are loading a <4 x float> with an 4821 // alignment of 8, this check will fail but the legalizer will try again 4822 // with 2 x <2 x float>, which will succeed with an alignment of 8. 4823 return; 4824 } 4825 4826 EVT EltVT = ResVT.getVectorElementType(); 4827 unsigned NumElts = ResVT.getVectorNumElements(); 4828 4829 // Since LoadV2 is a target node, we cannot rely on DAG type legalization. 4830 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 4831 // loaded type to i16 and propagate the "real" type as the memory type. 4832 bool NeedTrunc = false; 4833 if (EltVT.getSizeInBits() < 16) { 4834 EltVT = MVT::i16; 4835 NeedTrunc = true; 4836 } 4837 4838 unsigned Opcode = 0; 4839 SDVTList LdResVTs; 4840 bool LoadF16x2 = false; 4841 4842 switch (NumElts) { 4843 default: 4844 return; 4845 case 2: 4846 Opcode = NVPTXISD::LoadV2; 4847 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 4848 break; 4849 case 4: { 4850 Opcode = NVPTXISD::LoadV4; 4851 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 4852 LdResVTs = DAG.getVTList(ListVTs); 4853 break; 4854 } 4855 case 8: { 4856 // v8f16 is a special case. PTX doesn't have ld.v8.f16 4857 // instruction. Instead, we split the vector into v2f16 chunks and 4858 // load them with ld.v4.b32. 4859 assert(EltVT == MVT::f16 && "Unsupported v8 vector type."); 4860 LoadF16x2 = true; 4861 Opcode = NVPTXISD::LoadV4; 4862 EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16, 4863 MVT::Other}; 4864 LdResVTs = DAG.getVTList(ListVTs); 4865 break; 4866 } 4867 } 4868 4869 // Copy regular operands 4870 SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end()); 4871 4872 // The select routine does not have access to the LoadSDNode instance, so 4873 // pass along the extension information 4874 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); 4875 4876 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 4877 LD->getMemoryVT(), 4878 LD->getMemOperand()); 4879 4880 SmallVector<SDValue, 8> ScalarRes; 4881 if (LoadF16x2) { 4882 // Split v2f16 subvectors back into individual elements. 
4883 NumElts /= 2; 4884 for (unsigned i = 0; i < NumElts; ++i) { 4885 SDValue SubVector = NewLD.getValue(i); 4886 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, 4887 DAG.getIntPtrConstant(0, DL)); 4888 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, 4889 DAG.getIntPtrConstant(1, DL)); 4890 ScalarRes.push_back(E0); 4891 ScalarRes.push_back(E1); 4892 } 4893 } else { 4894 for (unsigned i = 0; i < NumElts; ++i) { 4895 SDValue Res = NewLD.getValue(i); 4896 if (NeedTrunc) 4897 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 4898 ScalarRes.push_back(Res); 4899 } 4900 } 4901 4902 SDValue LoadChain = NewLD.getValue(NumElts); 4903 4904 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes); 4905 4906 Results.push_back(BuildVec); 4907 Results.push_back(LoadChain); 4908 } 4909 4910 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, 4911 SmallVectorImpl<SDValue> &Results) { 4912 SDValue Chain = N->getOperand(0); 4913 SDValue Intrin = N->getOperand(1); 4914 SDLoc DL(N); 4915 4916 // Get the intrinsic ID 4917 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); 4918 switch (IntrinNo) { 4919 default: 4920 return; 4921 case Intrinsic::nvvm_ldg_global_i: 4922 case Intrinsic::nvvm_ldg_global_f: 4923 case Intrinsic::nvvm_ldg_global_p: 4924 case Intrinsic::nvvm_ldu_global_i: 4925 case Intrinsic::nvvm_ldu_global_f: 4926 case Intrinsic::nvvm_ldu_global_p: { 4927 EVT ResVT = N->getValueType(0); 4928 4929 if (ResVT.isVector()) { 4930 // Vector LDG/LDU 4931 4932 unsigned NumElts = ResVT.getVectorNumElements(); 4933 EVT EltVT = ResVT.getVectorElementType(); 4934 4935 // Since LDU/LDG are target nodes, we cannot rely on DAG type 4936 // legalization. 4937 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 4938 // loaded type to i16 and propagate the "real" type as the memory type. 
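      // For example, a <2 x i8> ldg is produced below as two i16 results
      // (the memory type still records the real <2 x i8> type), and each
      // result is truncated back to i8 after the load.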
4939 bool NeedTrunc = false; 4940 if (EltVT.getSizeInBits() < 16) { 4941 EltVT = MVT::i16; 4942 NeedTrunc = true; 4943 } 4944 4945 unsigned Opcode = 0; 4946 SDVTList LdResVTs; 4947 4948 switch (NumElts) { 4949 default: 4950 return; 4951 case 2: 4952 switch (IntrinNo) { 4953 default: 4954 return; 4955 case Intrinsic::nvvm_ldg_global_i: 4956 case Intrinsic::nvvm_ldg_global_f: 4957 case Intrinsic::nvvm_ldg_global_p: 4958 Opcode = NVPTXISD::LDGV2; 4959 break; 4960 case Intrinsic::nvvm_ldu_global_i: 4961 case Intrinsic::nvvm_ldu_global_f: 4962 case Intrinsic::nvvm_ldu_global_p: 4963 Opcode = NVPTXISD::LDUV2; 4964 break; 4965 } 4966 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 4967 break; 4968 case 4: { 4969 switch (IntrinNo) { 4970 default: 4971 return; 4972 case Intrinsic::nvvm_ldg_global_i: 4973 case Intrinsic::nvvm_ldg_global_f: 4974 case Intrinsic::nvvm_ldg_global_p: 4975 Opcode = NVPTXISD::LDGV4; 4976 break; 4977 case Intrinsic::nvvm_ldu_global_i: 4978 case Intrinsic::nvvm_ldu_global_f: 4979 case Intrinsic::nvvm_ldu_global_p: 4980 Opcode = NVPTXISD::LDUV4; 4981 break; 4982 } 4983 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 4984 LdResVTs = DAG.getVTList(ListVTs); 4985 break; 4986 } 4987 } 4988 4989 SmallVector<SDValue, 8> OtherOps; 4990 4991 // Copy regular operands 4992 4993 OtherOps.push_back(Chain); // Chain 4994 // Skip operand 1 (intrinsic ID) 4995 // Others 4996 OtherOps.append(N->op_begin() + 2, N->op_end()); 4997 4998 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 4999 5000 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 5001 MemSD->getMemoryVT(), 5002 MemSD->getMemOperand()); 5003 5004 SmallVector<SDValue, 4> ScalarRes; 5005 5006 for (unsigned i = 0; i < NumElts; ++i) { 5007 SDValue Res = NewLD.getValue(i); 5008 if (NeedTrunc) 5009 Res = 5010 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 5011 ScalarRes.push_back(Res); 5012 } 5013 5014 SDValue LoadChain = NewLD.getValue(NumElts); 5015 5016 SDValue BuildVec = 5017 DAG.getBuildVector(ResVT, DL, ScalarRes); 5018 5019 Results.push_back(BuildVec); 5020 Results.push_back(LoadChain); 5021 } else { 5022 // i8 LDG/LDU 5023 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && 5024 "Custom handling of non-i8 ldu/ldg?"); 5025 5026 // Just copy all operands as-is 5027 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); 5028 5029 // Force output to i16 5030 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); 5031 5032 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 5033 5034 // We make sure the memory type is i8, which will be used during isel 5035 // to select the proper instruction. 5036 SDValue NewLD = 5037 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, 5038 MVT::i8, MemSD->getMemOperand()); 5039 5040 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, 5041 NewLD.getValue(0))); 5042 Results.push_back(NewLD.getValue(1)); 5043 } 5044 } 5045 } 5046 } 5047 5048 void NVPTXTargetLowering::ReplaceNodeResults( 5049 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 5050 switch (N->getOpcode()) { 5051 default: 5052 report_fatal_error("Unhandled custom legalization"); 5053 case ISD::LOAD: 5054 ReplaceLoadVector(N, DAG, Results); 5055 return; 5056 case ISD::INTRINSIC_W_CHAIN: 5057 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); 5058 return; 5059 } 5060 } 5061 5062 // Pin NVPTXTargetObjectFile's vtables to this file. 
5063 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {} 5064 5065 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( 5066 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { 5067 return getDataSection(); 5068 } 5069