1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a 10 // selection DAG. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "NVPTXISelLowering.h" 15 #include "MCTargetDesc/NVPTXBaseInfo.h" 16 #include "NVPTX.h" 17 #include "NVPTXSubtarget.h" 18 #include "NVPTXTargetMachine.h" 19 #include "NVPTXTargetObjectFile.h" 20 #include "NVPTXUtilities.h" 21 #include "llvm/ADT/APInt.h" 22 #include "llvm/ADT/SmallVector.h" 23 #include "llvm/ADT/StringRef.h" 24 #include "llvm/CodeGen/Analysis.h" 25 #include "llvm/CodeGen/MachineFunction.h" 26 #include "llvm/CodeGen/MachineMemOperand.h" 27 #include "llvm/CodeGen/SelectionDAG.h" 28 #include "llvm/CodeGen/SelectionDAGNodes.h" 29 #include "llvm/CodeGen/TargetCallingConv.h" 30 #include "llvm/CodeGen/TargetLowering.h" 31 #include "llvm/CodeGen/ValueTypes.h" 32 #include "llvm/IR/Argument.h" 33 #include "llvm/IR/Attributes.h" 34 #include "llvm/IR/CallSite.h" 35 #include "llvm/IR/Constants.h" 36 #include "llvm/IR/DataLayout.h" 37 #include "llvm/IR/DerivedTypes.h" 38 #include "llvm/IR/Function.h" 39 #include "llvm/IR/GlobalValue.h" 40 #include "llvm/IR/Instruction.h" 41 #include "llvm/IR/Instructions.h" 42 #include "llvm/IR/Module.h" 43 #include "llvm/IR/Type.h" 44 #include "llvm/IR/Value.h" 45 #include "llvm/Support/Casting.h" 46 #include "llvm/Support/CodeGen.h" 47 #include "llvm/Support/CommandLine.h" 48 #include "llvm/Support/ErrorHandling.h" 49 #include "llvm/Support/MachineValueType.h" 50 #include "llvm/Support/MathExtras.h" 51 #include "llvm/Support/raw_ostream.h" 52 #include "llvm/Target/TargetMachine.h" 53 #include "llvm/Target/TargetOptions.h" 54 #include <algorithm> 55 #include <cassert> 56 #include <cstdint> 57 #include <iterator> 58 #include <sstream> 59 #include <string> 60 #include <utility> 61 #include <vector> 62 63 #define DEBUG_TYPE "nvptx-lower" 64 65 using namespace llvm; 66 67 static unsigned int uniqueCallSite = 0; 68 69 static cl::opt<bool> sched4reg( 70 "nvptx-sched4reg", 71 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false)); 72 73 static cl::opt<unsigned> 74 FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden, 75 cl::desc("NVPTX Specific: FMA contraction (0: don't do it" 76 " 1: do it 2: do it aggressively"), 77 cl::init(2)); 78 79 static cl::opt<int> UsePrecDivF32( 80 "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden, 81 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" 82 " IEEE Compliant F32 div.rnd if available."), 83 cl::init(2)); 84 85 static cl::opt<bool> UsePrecSqrtF32( 86 "nvptx-prec-sqrtf32", cl::Hidden, 87 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), 88 cl::init(true)); 89 90 static cl::opt<bool> FtzEnabled( 91 "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden, 92 cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."), 93 cl::init(false)); 94 95 int NVPTXTargetLowering::getDivF32Level() const { 96 if (UsePrecDivF32.getNumOccurrences() > 0) { 97 // If nvptx-prec-div32=N is used on the command-line, always honor it 98 return UsePrecDivF32; 99 } else { 100 // Otherwise, use div.approx if fast math is enabled 101 if (getTargetMachine().Options.UnsafeFPMath) 102 return 0; 103 else 104 return 2; 105 } 106 } 107 108 bool NVPTXTargetLowering::usePrecSqrtF32() const { 109 if (UsePrecSqrtF32.getNumOccurrences() > 0) { 110 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it 111 return UsePrecSqrtF32; 112 } else { 113 // Otherwise, use sqrt.approx if fast math is enabled 114 return !getTargetMachine().Options.UnsafeFPMath; 115 } 116 } 117 118 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const { 119 // TODO: Get rid of this flag; there can be only one way to do this. 120 if (FtzEnabled.getNumOccurrences() > 0) { 121 // If nvptx-f32ftz is used on the command-line, always honor it 122 return FtzEnabled; 123 } else { 124 const Function &F = MF.getFunction(); 125 // Otherwise, check for an nvptx-f32ftz attribute on the function 126 if (F.hasFnAttribute("nvptx-f32ftz")) 127 return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true"; 128 else 129 return false; 130 } 131 } 132 133 static bool IsPTXVectorType(MVT VT) { 134 switch (VT.SimpleTy) { 135 default: 136 return false; 137 case MVT::v2i1: 138 case MVT::v4i1: 139 case MVT::v2i8: 140 case MVT::v4i8: 141 case MVT::v2i16: 142 case MVT::v4i16: 143 case MVT::v2i32: 144 case MVT::v4i32: 145 case MVT::v2i64: 146 case MVT::v2f16: 147 case MVT::v4f16: 148 case MVT::v8f16: // <4 x f16x2> 149 case MVT::v2f32: 150 case MVT::v4f32: 151 case MVT::v2f64: 152 return true; 153 } 154 } 155 156 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive 157 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors 158 /// into their primitive components. 159 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the 160 /// same number of types as the Ins/Outs arrays in LowerFormalArguments, 161 /// LowerCall, and LowerReturn. 162 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, 163 Type *Ty, SmallVectorImpl<EVT> &ValueVTs, 164 SmallVectorImpl<uint64_t> *Offsets = nullptr, 165 uint64_t StartingOffset = 0) { 166 SmallVector<EVT, 16> TempVTs; 167 SmallVector<uint64_t, 16> TempOffsets; 168 169 // Special case for i128 - decompose to (i64, i64) 170 if (Ty->isIntegerTy(128)) { 171 ValueVTs.push_back(EVT(MVT::i64)); 172 ValueVTs.push_back(EVT(MVT::i64)); 173 174 if (Offsets) { 175 Offsets->push_back(StartingOffset + 0); 176 Offsets->push_back(StartingOffset + 8); 177 } 178 179 return; 180 } 181 182 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs. 183 if (StructType *STy = dyn_cast<StructType>(Ty)) { 184 auto const *SL = DL.getStructLayout(STy); 185 auto ElementNum = 0; 186 for(auto *EI : STy->elements()) { 187 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets, 188 StartingOffset + SL->getElementOffset(ElementNum)); 189 ++ElementNum; 190 } 191 return; 192 } 193 194 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); 195 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { 196 EVT VT = TempVTs[i]; 197 uint64_t Off = TempOffsets[i]; 198 // Split vectors into individual elements, except for v2f16, which 199 // we will pass as a single scalar. 200 if (VT.isVector()) { 201 unsigned NumElts = VT.getVectorNumElements(); 202 EVT EltVT = VT.getVectorElementType(); 203 // Vectors with an even number of f16 elements will be passed to 204 // us as an array of v2f16 elements. We must match this so we 205 // stay in sync with Ins/Outs. 206 if (EltVT == MVT::f16 && NumElts % 2 == 0) { 207 EltVT = MVT::v2f16; 208 NumElts /= 2; 209 } 210 for (unsigned j = 0; j != NumElts; ++j) { 211 ValueVTs.push_back(EltVT); 212 if (Offsets) 213 Offsets->push_back(Off + j * EltVT.getStoreSize()); 214 } 215 } else { 216 ValueVTs.push_back(VT); 217 if (Offsets) 218 Offsets->push_back(Off); 219 } 220 } 221 } 222 223 // Check whether we can merge loads/stores of some of the pieces of a 224 // flattened function parameter or return value into a single vector 225 // load/store. 226 // 227 // The flattened parameter is represented as a list of EVTs and 228 // offsets, and the whole structure is aligned to ParamAlignment. This 229 // function determines whether we can load/store pieces of the 230 // parameter starting at index Idx using a single vectorized op of 231 // size AccessSize. If so, it returns the number of param pieces 232 // covered by the vector op. Otherwise, it returns 1. 233 static unsigned CanMergeParamLoadStoresStartingAt( 234 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs, 235 const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) { 236 assert(isPowerOf2_32(AccessSize) && "must be a power of 2!"); 237 238 // Can't vectorize if param alignment is not sufficient. 239 if (AccessSize > ParamAlignment) 240 return 1; 241 // Can't vectorize if offset is not aligned. 242 if (Offsets[Idx] & (AccessSize - 1)) 243 return 1; 244 245 EVT EltVT = ValueVTs[Idx]; 246 unsigned EltSize = EltVT.getStoreSize(); 247 248 // Element is too large to vectorize. 249 if (EltSize >= AccessSize) 250 return 1; 251 252 unsigned NumElts = AccessSize / EltSize; 253 // Can't vectorize if AccessBytes if not a multiple of EltSize. 254 if (AccessSize != EltSize * NumElts) 255 return 1; 256 257 // We don't have enough elements to vectorize. 258 if (Idx + NumElts > ValueVTs.size()) 259 return 1; 260 261 // PTX ISA can only deal with 2- and 4-element vector ops. 262 if (NumElts != 4 && NumElts != 2) 263 return 1; 264 265 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) { 266 // Types do not match. 267 if (ValueVTs[j] != EltVT) 268 return 1; 269 270 // Elements are not contiguous. 271 if (Offsets[j] - Offsets[j - 1] != EltSize) 272 return 1; 273 } 274 // OK. We can vectorize ValueVTs[i..i+NumElts) 275 return NumElts; 276 } 277 278 // Flags for tracking per-element vectorization state of loads/stores 279 // of a flattened function parameter or return value. 280 enum ParamVectorizationFlags { 281 PVF_INNER = 0x0, // Middle elements of a vector. 282 PVF_FIRST = 0x1, // First element of the vector. 283 PVF_LAST = 0x2, // Last element of the vector. 284 // Scalar is effectively a 1-element vector. 285 PVF_SCALAR = PVF_FIRST | PVF_LAST 286 }; 287 288 // Computes whether and how we can vectorize the loads/stores of a 289 // flattened function parameter or return value. 290 // 291 // The flattened parameter is represented as the list of ValueVTs and 292 // Offsets, and is aligned to ParamAlignment bytes. We return a vector 293 // of the same size as ValueVTs indicating how each piece should be 294 // loaded/stored (i.e. as a scalar, or as part of a vector 295 // load/store). 296 static SmallVector<ParamVectorizationFlags, 16> 297 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs, 298 const SmallVectorImpl<uint64_t> &Offsets, 299 unsigned ParamAlignment) { 300 // Set vector size to match ValueVTs and mark all elements as 301 // scalars by default. 302 SmallVector<ParamVectorizationFlags, 16> VectorInfo; 303 VectorInfo.assign(ValueVTs.size(), PVF_SCALAR); 304 305 // Check what we can vectorize using 128/64/32-bit accesses. 306 for (int I = 0, E = ValueVTs.size(); I != E; ++I) { 307 // Skip elements we've already processed. 308 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state."); 309 for (unsigned AccessSize : {16, 8, 4, 2}) { 310 unsigned NumElts = CanMergeParamLoadStoresStartingAt( 311 I, AccessSize, ValueVTs, Offsets, ParamAlignment); 312 // Mark vectorized elements. 313 switch (NumElts) { 314 default: 315 llvm_unreachable("Unexpected return value"); 316 case 1: 317 // Can't vectorize using this size, try next smaller size. 318 continue; 319 case 2: 320 assert(I + 1 < E && "Not enough elements."); 321 VectorInfo[I] = PVF_FIRST; 322 VectorInfo[I + 1] = PVF_LAST; 323 I += 1; 324 break; 325 case 4: 326 assert(I + 3 < E && "Not enough elements."); 327 VectorInfo[I] = PVF_FIRST; 328 VectorInfo[I + 1] = PVF_INNER; 329 VectorInfo[I + 2] = PVF_INNER; 330 VectorInfo[I + 3] = PVF_LAST; 331 I += 3; 332 break; 333 } 334 // Break out of the inner loop because we've already succeeded 335 // using largest possible AccessSize. 336 break; 337 } 338 } 339 return VectorInfo; 340 } 341 342 // NVPTXTargetLowering Constructor. 343 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, 344 const NVPTXSubtarget &STI) 345 : TargetLowering(TM), nvTM(&TM), STI(STI) { 346 // always lower memset, memcpy, and memmove intrinsics to load/store 347 // instructions, rather 348 // then generating calls to memset, mempcy or memmove. 349 MaxStoresPerMemset = (unsigned) 0xFFFFFFFF; 350 MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF; 351 MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; 352 353 setBooleanContents(ZeroOrNegativeOneBooleanContent); 354 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 355 356 // Jump is Expensive. Don't create extra control flow for 'and', 'or' 357 // condition branches. 358 setJumpIsExpensive(true); 359 360 // Wide divides are _very_ slow. Try to reduce the width of the divide if 361 // possible. 362 addBypassSlowDiv(64, 32); 363 364 // By default, use the Source scheduling 365 if (sched4reg) 366 setSchedulingPreference(Sched::RegPressure); 367 else 368 setSchedulingPreference(Sched::Source); 369 370 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, 371 LegalizeAction NoF16Action) { 372 setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action); 373 }; 374 375 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); 376 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); 377 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); 378 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); 379 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); 380 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); 381 addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass); 382 addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass); 383 384 // Conversion to/from FP16/FP16x2 is always legal. 385 setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal); 386 setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal); 387 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); 388 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); 389 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand); 390 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand); 391 392 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote); 393 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand); 394 395 // Operations not directly supported by NVPTX. 396 for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8, 397 MVT::i16, MVT::i32, MVT::i64}) { 398 setOperationAction(ISD::SELECT_CC, VT, Expand); 399 setOperationAction(ISD::BR_CC, VT, Expand); 400 } 401 402 // Some SIGN_EXTEND_INREG can be done using cvt instruction. 403 // For others we will expand to a SHL/SRA pair. 404 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); 405 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 406 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 407 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 408 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 409 410 setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); 411 setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); 412 setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); 413 setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); 414 setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); 415 setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); 416 417 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 418 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); 419 420 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs 421 // that don't have h/w rotation we lower them to multi-instruction assembly. 422 // See ROT*_sw in NVPTXIntrInfo.td 423 setOperationAction(ISD::ROTL, MVT::i64, Legal); 424 setOperationAction(ISD::ROTR, MVT::i64, Legal); 425 setOperationAction(ISD::ROTL, MVT::i32, Legal); 426 setOperationAction(ISD::ROTR, MVT::i32, Legal); 427 428 setOperationAction(ISD::ROTL, MVT::i16, Expand); 429 setOperationAction(ISD::ROTR, MVT::i16, Expand); 430 setOperationAction(ISD::ROTL, MVT::i8, Expand); 431 setOperationAction(ISD::ROTR, MVT::i8, Expand); 432 setOperationAction(ISD::BSWAP, MVT::i16, Expand); 433 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 434 setOperationAction(ISD::BSWAP, MVT::i64, Expand); 435 436 // Indirect branch is not supported. 437 // This also disables Jump Table creation. 438 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 439 setOperationAction(ISD::BRIND, MVT::Other, Expand); 440 441 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 442 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 443 444 // We want to legalize constant related memmove and memcopy 445 // intrinsics. 446 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 447 448 // Turn FP extload into load/fpextend 449 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); 450 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); 451 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); 452 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); 453 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); 454 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); 455 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); 456 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); 457 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); 458 // Turn FP truncstore into trunc + store. 459 // FIXME: vector types should also be expanded 460 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 461 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 462 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 463 464 // PTX does not support load / store predicate registers 465 setOperationAction(ISD::LOAD, MVT::i1, Custom); 466 setOperationAction(ISD::STORE, MVT::i1, Custom); 467 468 for (MVT VT : MVT::integer_valuetypes()) { 469 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 470 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 471 setTruncStoreAction(VT, MVT::i1, Expand); 472 } 473 474 // This is legal in NVPTX 475 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 476 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 477 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 478 479 // TRAP can be lowered to PTX trap 480 setOperationAction(ISD::TRAP, MVT::Other, Legal); 481 482 // Register custom handling for vector loads/stores 483 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 484 if (IsPTXVectorType(VT)) { 485 setOperationAction(ISD::LOAD, VT, Custom); 486 setOperationAction(ISD::STORE, VT, Custom); 487 setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); 488 } 489 } 490 491 // Custom handling for i8 intrinsics 492 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); 493 494 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) { 495 setOperationAction(ISD::ABS, Ty, Legal); 496 setOperationAction(ISD::SMIN, Ty, Legal); 497 setOperationAction(ISD::SMAX, Ty, Legal); 498 setOperationAction(ISD::UMIN, Ty, Legal); 499 setOperationAction(ISD::UMAX, Ty, Legal); 500 501 setOperationAction(ISD::CTPOP, Ty, Legal); 502 setOperationAction(ISD::CTLZ, Ty, Legal); 503 } 504 505 setOperationAction(ISD::CTTZ, MVT::i16, Expand); 506 setOperationAction(ISD::CTTZ, MVT::i32, Expand); 507 setOperationAction(ISD::CTTZ, MVT::i64, Expand); 508 509 // PTX does not directly support SELP of i1, so promote to i32 first 510 setOperationAction(ISD::SELECT, MVT::i1, Custom); 511 512 // PTX cannot multiply two i64s in a single instruction. 513 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 514 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 515 516 // We have some custom DAG combine patterns for these nodes 517 setTargetDAGCombine(ISD::ADD); 518 setTargetDAGCombine(ISD::AND); 519 setTargetDAGCombine(ISD::FADD); 520 setTargetDAGCombine(ISD::MUL); 521 setTargetDAGCombine(ISD::SHL); 522 setTargetDAGCombine(ISD::SREM); 523 setTargetDAGCombine(ISD::UREM); 524 525 // setcc for f16x2 needs special handling to prevent legalizer's 526 // attempt to scalarize it due to v2i1 not being legal. 527 if (STI.allowFP16Math()) 528 setTargetDAGCombine(ISD::SETCC); 529 530 // Promote fp16 arithmetic if fp16 hardware isn't available or the 531 // user passed --nvptx-no-fp16-math. The flag is useful because, 532 // although sm_53+ GPUs have some sort of FP16 support in 533 // hardware, only sm_53 and sm_60 have full implementation. Others 534 // only have token amount of hardware and are likely to run faster 535 // by using fp32 units instead. 536 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { 537 setFP16OperationAction(Op, MVT::f16, Legal, Promote); 538 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); 539 } 540 541 // There's no neg.f16 instruction. Expand to (0-x). 542 setOperationAction(ISD::FNEG, MVT::f16, Expand); 543 setOperationAction(ISD::FNEG, MVT::v2f16, Expand); 544 545 // (would be) Library functions. 546 547 // These map to conversion instructions for scalar FP types. 548 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, 549 ISD::FTRUNC}) { 550 setOperationAction(Op, MVT::f16, Legal); 551 setOperationAction(Op, MVT::f32, Legal); 552 setOperationAction(Op, MVT::f64, Legal); 553 setOperationAction(Op, MVT::v2f16, Expand); 554 } 555 556 setOperationAction(ISD::FROUND, MVT::f16, Promote); 557 setOperationAction(ISD::FROUND, MVT::v2f16, Expand); 558 setOperationAction(ISD::FROUND, MVT::f32, Custom); 559 setOperationAction(ISD::FROUND, MVT::f64, Custom); 560 561 562 // 'Expand' implements FCOPYSIGN without calling an external library. 563 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 564 setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); 565 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 566 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 567 568 // These map to corresponding instructions for f32/f64. f16 must be 569 // promoted to f32. v2f16 is expanded to f16, which is then promoted 570 // to f32. 571 for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, 572 ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) { 573 setOperationAction(Op, MVT::f16, Promote); 574 setOperationAction(Op, MVT::f32, Legal); 575 setOperationAction(Op, MVT::f64, Legal); 576 setOperationAction(Op, MVT::v2f16, Expand); 577 } 578 setOperationAction(ISD::FMINNUM, MVT::f16, Promote); 579 setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); 580 setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); 581 setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); 582 583 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. 584 // No FPOW or FREM in PTX. 585 586 // Now deduce the information based on the above mentioned 587 // actions 588 computeRegisterProperties(STI.getRegisterInfo()); 589 } 590 591 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 592 switch ((NVPTXISD::NodeType)Opcode) { 593 case NVPTXISD::FIRST_NUMBER: 594 break; 595 case NVPTXISD::CALL: 596 return "NVPTXISD::CALL"; 597 case NVPTXISD::RET_FLAG: 598 return "NVPTXISD::RET_FLAG"; 599 case NVPTXISD::LOAD_PARAM: 600 return "NVPTXISD::LOAD_PARAM"; 601 case NVPTXISD::Wrapper: 602 return "NVPTXISD::Wrapper"; 603 case NVPTXISD::DeclareParam: 604 return "NVPTXISD::DeclareParam"; 605 case NVPTXISD::DeclareScalarParam: 606 return "NVPTXISD::DeclareScalarParam"; 607 case NVPTXISD::DeclareRet: 608 return "NVPTXISD::DeclareRet"; 609 case NVPTXISD::DeclareScalarRet: 610 return "NVPTXISD::DeclareScalarRet"; 611 case NVPTXISD::DeclareRetParam: 612 return "NVPTXISD::DeclareRetParam"; 613 case NVPTXISD::PrintCall: 614 return "NVPTXISD::PrintCall"; 615 case NVPTXISD::PrintConvergentCall: 616 return "NVPTXISD::PrintConvergentCall"; 617 case NVPTXISD::PrintCallUni: 618 return "NVPTXISD::PrintCallUni"; 619 case NVPTXISD::PrintConvergentCallUni: 620 return "NVPTXISD::PrintConvergentCallUni"; 621 case NVPTXISD::LoadParam: 622 return "NVPTXISD::LoadParam"; 623 case NVPTXISD::LoadParamV2: 624 return "NVPTXISD::LoadParamV2"; 625 case NVPTXISD::LoadParamV4: 626 return "NVPTXISD::LoadParamV4"; 627 case NVPTXISD::StoreParam: 628 return "NVPTXISD::StoreParam"; 629 case NVPTXISD::StoreParamV2: 630 return "NVPTXISD::StoreParamV2"; 631 case NVPTXISD::StoreParamV4: 632 return "NVPTXISD::StoreParamV4"; 633 case NVPTXISD::StoreParamS32: 634 return "NVPTXISD::StoreParamS32"; 635 case NVPTXISD::StoreParamU32: 636 return "NVPTXISD::StoreParamU32"; 637 case NVPTXISD::CallArgBegin: 638 return "NVPTXISD::CallArgBegin"; 639 case NVPTXISD::CallArg: 640 return "NVPTXISD::CallArg"; 641 case NVPTXISD::LastCallArg: 642 return "NVPTXISD::LastCallArg"; 643 case NVPTXISD::CallArgEnd: 644 return "NVPTXISD::CallArgEnd"; 645 case NVPTXISD::CallVoid: 646 return "NVPTXISD::CallVoid"; 647 case NVPTXISD::CallVal: 648 return "NVPTXISD::CallVal"; 649 case NVPTXISD::CallSymbol: 650 return "NVPTXISD::CallSymbol"; 651 case NVPTXISD::Prototype: 652 return "NVPTXISD::Prototype"; 653 case NVPTXISD::MoveParam: 654 return "NVPTXISD::MoveParam"; 655 case NVPTXISD::StoreRetval: 656 return "NVPTXISD::StoreRetval"; 657 case NVPTXISD::StoreRetvalV2: 658 return "NVPTXISD::StoreRetvalV2"; 659 case NVPTXISD::StoreRetvalV4: 660 return "NVPTXISD::StoreRetvalV4"; 661 case NVPTXISD::PseudoUseParam: 662 return "NVPTXISD::PseudoUseParam"; 663 case NVPTXISD::RETURN: 664 return "NVPTXISD::RETURN"; 665 case NVPTXISD::CallSeqBegin: 666 return "NVPTXISD::CallSeqBegin"; 667 case NVPTXISD::CallSeqEnd: 668 return "NVPTXISD::CallSeqEnd"; 669 case NVPTXISD::CallPrototype: 670 return "NVPTXISD::CallPrototype"; 671 case NVPTXISD::ProxyReg: 672 return "NVPTXISD::ProxyReg"; 673 case NVPTXISD::LoadV2: 674 return "NVPTXISD::LoadV2"; 675 case NVPTXISD::LoadV4: 676 return "NVPTXISD::LoadV4"; 677 case NVPTXISD::LDGV2: 678 return "NVPTXISD::LDGV2"; 679 case NVPTXISD::LDGV4: 680 return "NVPTXISD::LDGV4"; 681 case NVPTXISD::LDUV2: 682 return "NVPTXISD::LDUV2"; 683 case NVPTXISD::LDUV4: 684 return "NVPTXISD::LDUV4"; 685 case NVPTXISD::StoreV2: 686 return "NVPTXISD::StoreV2"; 687 case NVPTXISD::StoreV4: 688 return "NVPTXISD::StoreV4"; 689 case NVPTXISD::FUN_SHFL_CLAMP: 690 return "NVPTXISD::FUN_SHFL_CLAMP"; 691 case NVPTXISD::FUN_SHFR_CLAMP: 692 return "NVPTXISD::FUN_SHFR_CLAMP"; 693 case NVPTXISD::IMAD: 694 return "NVPTXISD::IMAD"; 695 case NVPTXISD::SETP_F16X2: 696 return "NVPTXISD::SETP_F16X2"; 697 case NVPTXISD::Dummy: 698 return "NVPTXISD::Dummy"; 699 case NVPTXISD::MUL_WIDE_SIGNED: 700 return "NVPTXISD::MUL_WIDE_SIGNED"; 701 case NVPTXISD::MUL_WIDE_UNSIGNED: 702 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 703 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; 704 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 705 case NVPTXISD::Tex1DFloatFloatLevel: 706 return "NVPTXISD::Tex1DFloatFloatLevel"; 707 case NVPTXISD::Tex1DFloatFloatGrad: 708 return "NVPTXISD::Tex1DFloatFloatGrad"; 709 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; 710 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; 711 case NVPTXISD::Tex1DS32FloatLevel: 712 return "NVPTXISD::Tex1DS32FloatLevel"; 713 case NVPTXISD::Tex1DS32FloatGrad: 714 return "NVPTXISD::Tex1DS32FloatGrad"; 715 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; 716 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; 717 case NVPTXISD::Tex1DU32FloatLevel: 718 return "NVPTXISD::Tex1DU32FloatLevel"; 719 case NVPTXISD::Tex1DU32FloatGrad: 720 return "NVPTXISD::Tex1DU32FloatGrad"; 721 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; 722 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; 723 case NVPTXISD::Tex1DArrayFloatFloatLevel: 724 return "NVPTXISD::Tex1DArrayFloatFloatLevel"; 725 case NVPTXISD::Tex1DArrayFloatFloatGrad: 726 return "NVPTXISD::Tex1DArrayFloatFloatGrad"; 727 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; 728 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; 729 case NVPTXISD::Tex1DArrayS32FloatLevel: 730 return "NVPTXISD::Tex1DArrayS32FloatLevel"; 731 case NVPTXISD::Tex1DArrayS32FloatGrad: 732 return "NVPTXISD::Tex1DArrayS32FloatGrad"; 733 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; 734 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; 735 case NVPTXISD::Tex1DArrayU32FloatLevel: 736 return "NVPTXISD::Tex1DArrayU32FloatLevel"; 737 case NVPTXISD::Tex1DArrayU32FloatGrad: 738 return "NVPTXISD::Tex1DArrayU32FloatGrad"; 739 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; 740 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 741 case NVPTXISD::Tex2DFloatFloatLevel: 742 return "NVPTXISD::Tex2DFloatFloatLevel"; 743 case NVPTXISD::Tex2DFloatFloatGrad: 744 return "NVPTXISD::Tex2DFloatFloatGrad"; 745 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; 746 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; 747 case NVPTXISD::Tex2DS32FloatLevel: 748 return "NVPTXISD::Tex2DS32FloatLevel"; 749 case NVPTXISD::Tex2DS32FloatGrad: 750 return "NVPTXISD::Tex2DS32FloatGrad"; 751 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; 752 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; 753 case NVPTXISD::Tex2DU32FloatLevel: 754 return "NVPTXISD::Tex2DU32FloatLevel"; 755 case NVPTXISD::Tex2DU32FloatGrad: 756 return "NVPTXISD::Tex2DU32FloatGrad"; 757 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; 758 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 759 case NVPTXISD::Tex2DArrayFloatFloatLevel: 760 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 761 case NVPTXISD::Tex2DArrayFloatFloatGrad: 762 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 763 case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; 764 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; 765 case NVPTXISD::Tex2DArrayS32FloatLevel: 766 return "NVPTXISD::Tex2DArrayS32FloatLevel"; 767 case NVPTXISD::Tex2DArrayS32FloatGrad: 768 return "NVPTXISD::Tex2DArrayS32FloatGrad"; 769 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; 770 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; 771 case NVPTXISD::Tex2DArrayU32FloatLevel: 772 return "NVPTXISD::Tex2DArrayU32FloatLevel"; 773 case NVPTXISD::Tex2DArrayU32FloatGrad: 774 return "NVPTXISD::Tex2DArrayU32FloatGrad"; 775 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; 776 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 777 case NVPTXISD::Tex3DFloatFloatLevel: 778 return "NVPTXISD::Tex3DFloatFloatLevel"; 779 case NVPTXISD::Tex3DFloatFloatGrad: 780 return "NVPTXISD::Tex3DFloatFloatGrad"; 781 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; 782 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; 783 case NVPTXISD::Tex3DS32FloatLevel: 784 return "NVPTXISD::Tex3DS32FloatLevel"; 785 case NVPTXISD::Tex3DS32FloatGrad: 786 return "NVPTXISD::Tex3DS32FloatGrad"; 787 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; 788 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; 789 case NVPTXISD::Tex3DU32FloatLevel: 790 return "NVPTXISD::Tex3DU32FloatLevel"; 791 case NVPTXISD::Tex3DU32FloatGrad: 792 return "NVPTXISD::Tex3DU32FloatGrad"; 793 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; 794 case NVPTXISD::TexCubeFloatFloatLevel: 795 return "NVPTXISD::TexCubeFloatFloatLevel"; 796 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; 797 case NVPTXISD::TexCubeS32FloatLevel: 798 return "NVPTXISD::TexCubeS32FloatLevel"; 799 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; 800 case NVPTXISD::TexCubeU32FloatLevel: 801 return "NVPTXISD::TexCubeU32FloatLevel"; 802 case NVPTXISD::TexCubeArrayFloatFloat: 803 return "NVPTXISD::TexCubeArrayFloatFloat"; 804 case NVPTXISD::TexCubeArrayFloatFloatLevel: 805 return "NVPTXISD::TexCubeArrayFloatFloatLevel"; 806 case NVPTXISD::TexCubeArrayS32Float: 807 return "NVPTXISD::TexCubeArrayS32Float"; 808 case NVPTXISD::TexCubeArrayS32FloatLevel: 809 return "NVPTXISD::TexCubeArrayS32FloatLevel"; 810 case NVPTXISD::TexCubeArrayU32Float: 811 return "NVPTXISD::TexCubeArrayU32Float"; 812 case NVPTXISD::TexCubeArrayU32FloatLevel: 813 return "NVPTXISD::TexCubeArrayU32FloatLevel"; 814 case NVPTXISD::Tld4R2DFloatFloat: 815 return "NVPTXISD::Tld4R2DFloatFloat"; 816 case NVPTXISD::Tld4G2DFloatFloat: 817 return "NVPTXISD::Tld4G2DFloatFloat"; 818 case NVPTXISD::Tld4B2DFloatFloat: 819 return "NVPTXISD::Tld4B2DFloatFloat"; 820 case NVPTXISD::Tld4A2DFloatFloat: 821 return "NVPTXISD::Tld4A2DFloatFloat"; 822 case NVPTXISD::Tld4R2DS64Float: 823 return "NVPTXISD::Tld4R2DS64Float"; 824 case NVPTXISD::Tld4G2DS64Float: 825 return "NVPTXISD::Tld4G2DS64Float"; 826 case NVPTXISD::Tld4B2DS64Float: 827 return "NVPTXISD::Tld4B2DS64Float"; 828 case NVPTXISD::Tld4A2DS64Float: 829 return "NVPTXISD::Tld4A2DS64Float"; 830 case NVPTXISD::Tld4R2DU64Float: 831 return "NVPTXISD::Tld4R2DU64Float"; 832 case NVPTXISD::Tld4G2DU64Float: 833 return "NVPTXISD::Tld4G2DU64Float"; 834 case NVPTXISD::Tld4B2DU64Float: 835 return "NVPTXISD::Tld4B2DU64Float"; 836 case NVPTXISD::Tld4A2DU64Float: 837 return "NVPTXISD::Tld4A2DU64Float"; 838 839 case NVPTXISD::TexUnified1DFloatS32: 840 return "NVPTXISD::TexUnified1DFloatS32"; 841 case NVPTXISD::TexUnified1DFloatFloat: 842 return "NVPTXISD::TexUnified1DFloatFloat"; 843 case NVPTXISD::TexUnified1DFloatFloatLevel: 844 return "NVPTXISD::TexUnified1DFloatFloatLevel"; 845 case NVPTXISD::TexUnified1DFloatFloatGrad: 846 return "NVPTXISD::TexUnified1DFloatFloatGrad"; 847 case NVPTXISD::TexUnified1DS32S32: 848 return "NVPTXISD::TexUnified1DS32S32"; 849 case NVPTXISD::TexUnified1DS32Float: 850 return "NVPTXISD::TexUnified1DS32Float"; 851 case NVPTXISD::TexUnified1DS32FloatLevel: 852 return "NVPTXISD::TexUnified1DS32FloatLevel"; 853 case NVPTXISD::TexUnified1DS32FloatGrad: 854 return "NVPTXISD::TexUnified1DS32FloatGrad"; 855 case NVPTXISD::TexUnified1DU32S32: 856 return "NVPTXISD::TexUnified1DU32S32"; 857 case NVPTXISD::TexUnified1DU32Float: 858 return "NVPTXISD::TexUnified1DU32Float"; 859 case NVPTXISD::TexUnified1DU32FloatLevel: 860 return "NVPTXISD::TexUnified1DU32FloatLevel"; 861 case NVPTXISD::TexUnified1DU32FloatGrad: 862 return "NVPTXISD::TexUnified1DU32FloatGrad"; 863 case NVPTXISD::TexUnified1DArrayFloatS32: 864 return "NVPTXISD::TexUnified1DArrayFloatS32"; 865 case NVPTXISD::TexUnified1DArrayFloatFloat: 866 return "NVPTXISD::TexUnified1DArrayFloatFloat"; 867 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 868 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; 869 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 870 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; 871 case NVPTXISD::TexUnified1DArrayS32S32: 872 return "NVPTXISD::TexUnified1DArrayS32S32"; 873 case NVPTXISD::TexUnified1DArrayS32Float: 874 return "NVPTXISD::TexUnified1DArrayS32Float"; 875 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 876 return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; 877 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 878 return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; 879 case NVPTXISD::TexUnified1DArrayU32S32: 880 return "NVPTXISD::TexUnified1DArrayU32S32"; 881 case NVPTXISD::TexUnified1DArrayU32Float: 882 return "NVPTXISD::TexUnified1DArrayU32Float"; 883 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 884 return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; 885 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 886 return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; 887 case NVPTXISD::TexUnified2DFloatS32: 888 return "NVPTXISD::TexUnified2DFloatS32"; 889 case NVPTXISD::TexUnified2DFloatFloat: 890 return "NVPTXISD::TexUnified2DFloatFloat"; 891 case NVPTXISD::TexUnified2DFloatFloatLevel: 892 return "NVPTXISD::TexUnified2DFloatFloatLevel"; 893 case NVPTXISD::TexUnified2DFloatFloatGrad: 894 return "NVPTXISD::TexUnified2DFloatFloatGrad"; 895 case NVPTXISD::TexUnified2DS32S32: 896 return "NVPTXISD::TexUnified2DS32S32"; 897 case NVPTXISD::TexUnified2DS32Float: 898 return "NVPTXISD::TexUnified2DS32Float"; 899 case NVPTXISD::TexUnified2DS32FloatLevel: 900 return "NVPTXISD::TexUnified2DS32FloatLevel"; 901 case NVPTXISD::TexUnified2DS32FloatGrad: 902 return "NVPTXISD::TexUnified2DS32FloatGrad"; 903 case NVPTXISD::TexUnified2DU32S32: 904 return "NVPTXISD::TexUnified2DU32S32"; 905 case NVPTXISD::TexUnified2DU32Float: 906 return "NVPTXISD::TexUnified2DU32Float"; 907 case NVPTXISD::TexUnified2DU32FloatLevel: 908 return "NVPTXISD::TexUnified2DU32FloatLevel"; 909 case NVPTXISD::TexUnified2DU32FloatGrad: 910 return "NVPTXISD::TexUnified2DU32FloatGrad"; 911 case NVPTXISD::TexUnified2DArrayFloatS32: 912 return "NVPTXISD::TexUnified2DArrayFloatS32"; 913 case NVPTXISD::TexUnified2DArrayFloatFloat: 914 return "NVPTXISD::TexUnified2DArrayFloatFloat"; 915 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 916 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; 917 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 918 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; 919 case NVPTXISD::TexUnified2DArrayS32S32: 920 return "NVPTXISD::TexUnified2DArrayS32S32"; 921 case NVPTXISD::TexUnified2DArrayS32Float: 922 return "NVPTXISD::TexUnified2DArrayS32Float"; 923 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 924 return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; 925 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 926 return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; 927 case NVPTXISD::TexUnified2DArrayU32S32: 928 return "NVPTXISD::TexUnified2DArrayU32S32"; 929 case NVPTXISD::TexUnified2DArrayU32Float: 930 return "NVPTXISD::TexUnified2DArrayU32Float"; 931 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 932 return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; 933 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 934 return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; 935 case NVPTXISD::TexUnified3DFloatS32: 936 return "NVPTXISD::TexUnified3DFloatS32"; 937 case NVPTXISD::TexUnified3DFloatFloat: 938 return "NVPTXISD::TexUnified3DFloatFloat"; 939 case NVPTXISD::TexUnified3DFloatFloatLevel: 940 return "NVPTXISD::TexUnified3DFloatFloatLevel"; 941 case NVPTXISD::TexUnified3DFloatFloatGrad: 942 return "NVPTXISD::TexUnified3DFloatFloatGrad"; 943 case NVPTXISD::TexUnified3DS32S32: 944 return "NVPTXISD::TexUnified3DS32S32"; 945 case NVPTXISD::TexUnified3DS32Float: 946 return "NVPTXISD::TexUnified3DS32Float"; 947 case NVPTXISD::TexUnified3DS32FloatLevel: 948 return "NVPTXISD::TexUnified3DS32FloatLevel"; 949 case NVPTXISD::TexUnified3DS32FloatGrad: 950 return "NVPTXISD::TexUnified3DS32FloatGrad"; 951 case NVPTXISD::TexUnified3DU32S32: 952 return "NVPTXISD::TexUnified3DU32S32"; 953 case NVPTXISD::TexUnified3DU32Float: 954 return "NVPTXISD::TexUnified3DU32Float"; 955 case NVPTXISD::TexUnified3DU32FloatLevel: 956 return "NVPTXISD::TexUnified3DU32FloatLevel"; 957 case NVPTXISD::TexUnified3DU32FloatGrad: 958 return "NVPTXISD::TexUnified3DU32FloatGrad"; 959 case NVPTXISD::TexUnifiedCubeFloatFloat: 960 return "NVPTXISD::TexUnifiedCubeFloatFloat"; 961 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 962 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; 963 case NVPTXISD::TexUnifiedCubeS32Float: 964 return "NVPTXISD::TexUnifiedCubeS32Float"; 965 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 966 return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; 967 case NVPTXISD::TexUnifiedCubeU32Float: 968 return "NVPTXISD::TexUnifiedCubeU32Float"; 969 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 970 return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; 971 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 972 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; 973 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 974 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; 975 case NVPTXISD::TexUnifiedCubeArrayS32Float: 976 return "NVPTXISD::TexUnifiedCubeArrayS32Float"; 977 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 978 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; 979 case NVPTXISD::TexUnifiedCubeArrayU32Float: 980 return "NVPTXISD::TexUnifiedCubeArrayU32Float"; 981 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 982 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; 983 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 984 return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; 985 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 986 return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; 987 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 988 return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; 989 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 990 return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; 991 case NVPTXISD::Tld4UnifiedR2DS64Float: 992 return "NVPTXISD::Tld4UnifiedR2DS64Float"; 993 case NVPTXISD::Tld4UnifiedG2DS64Float: 994 return "NVPTXISD::Tld4UnifiedG2DS64Float"; 995 case NVPTXISD::Tld4UnifiedB2DS64Float: 996 return "NVPTXISD::Tld4UnifiedB2DS64Float"; 997 case NVPTXISD::Tld4UnifiedA2DS64Float: 998 return "NVPTXISD::Tld4UnifiedA2DS64Float"; 999 case NVPTXISD::Tld4UnifiedR2DU64Float: 1000 return "NVPTXISD::Tld4UnifiedR2DU64Float"; 1001 case NVPTXISD::Tld4UnifiedG2DU64Float: 1002 return "NVPTXISD::Tld4UnifiedG2DU64Float"; 1003 case NVPTXISD::Tld4UnifiedB2DU64Float: 1004 return "NVPTXISD::Tld4UnifiedB2DU64Float"; 1005 case NVPTXISD::Tld4UnifiedA2DU64Float: 1006 return "NVPTXISD::Tld4UnifiedA2DU64Float"; 1007 1008 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; 1009 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; 1010 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; 1011 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; 1012 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; 1013 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; 1014 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; 1015 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; 1016 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; 1017 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; 1018 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; 1019 1020 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; 1021 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; 1022 case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; 1023 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; 1024 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; 1025 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; 1026 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; 1027 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; 1028 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; 1029 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; 1030 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; 1031 1032 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; 1033 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; 1034 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; 1035 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; 1036 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; 1037 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; 1038 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; 1039 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; 1040 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; 1041 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; 1042 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; 1043 1044 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; 1045 case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; 1046 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; 1047 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; 1048 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; 1049 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; 1050 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; 1051 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; 1052 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; 1053 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; 1054 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; 1055 1056 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; 1057 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; 1058 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; 1059 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; 1060 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; 1061 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; 1062 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; 1063 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; 1064 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; 1065 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; 1066 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; 1067 1068 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 1069 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 1070 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 1071 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; 1072 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 1073 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 1074 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 1075 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; 1076 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 1077 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 1078 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 1079 1080 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 1081 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 1082 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 1083 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; 1084 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 1085 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 1086 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 1087 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; 1088 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 1089 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 1090 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 1091 1092 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 1093 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; 1094 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 1095 case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; 1096 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 1097 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; 1098 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 1099 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; 1100 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 1101 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 1102 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 1103 1104 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 1105 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 1106 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 1107 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; 1108 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 1109 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 1110 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 1111 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; 1112 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 1113 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 1114 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 1115 1116 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 1117 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 1118 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 1119 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; 1120 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 1121 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 1122 case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; 1123 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; 1124 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 1125 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 1126 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 1127 1128 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; 1129 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; 1130 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; 1131 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; 1132 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; 1133 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; 1134 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; 1135 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; 1136 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; 1137 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; 1138 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; 1139 1140 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; 1141 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; 1142 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; 1143 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; 1144 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; 1145 case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; 1146 case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; 1147 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; 1148 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; 1149 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; 1150 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; 1151 1152 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; 1153 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; 1154 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; 1155 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; 1156 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; 1157 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; 1158 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; 1159 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; 1160 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; 1161 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; 1162 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; 1163 1164 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; 1165 case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; 1166 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; 1167 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; 1168 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; 1169 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; 1170 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; 1171 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; 1172 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; 1173 case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; 1174 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; 1175 1176 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; 1177 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; 1178 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; 1179 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; 1180 case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; 1181 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; 1182 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; 1183 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; 1184 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; 1185 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; 1186 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; 1187 } 1188 return nullptr; 1189 } 1190 1191 TargetLoweringBase::LegalizeTypeAction 1192 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1193 if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) 1194 return TypeSplitVector; 1195 if (VT == MVT::v2f16) 1196 return TypeLegal; 1197 return TargetLoweringBase::getPreferredVectorAction(VT); 1198 } 1199 1200 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1201 int Enabled, int &ExtraSteps, 1202 bool &UseOneConst, 1203 bool Reciprocal) const { 1204 if (!(Enabled == ReciprocalEstimate::Enabled || 1205 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1206 return SDValue(); 1207 1208 if (ExtraSteps == ReciprocalEstimate::Unspecified) 1209 ExtraSteps = 0; 1210 1211 SDLoc DL(Operand); 1212 EVT VT = Operand.getValueType(); 1213 bool Ftz = useF32FTZ(DAG.getMachineFunction()); 1214 1215 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { 1216 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 1217 DAG.getConstant(IID, DL, MVT::i32), Operand); 1218 }; 1219 1220 // The sqrt and rsqrt refinement processes assume we always start out with an 1221 // approximation of the rsqrt. Therefore, if we're going to do any refinement 1222 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing 1223 // any refinement, we must return a regular sqrt. 1224 if (Reciprocal || ExtraSteps > 0) { 1225 if (VT == MVT::f32) 1226 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f 1227 : Intrinsic::nvvm_rsqrt_approx_f); 1228 else if (VT == MVT::f64) 1229 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); 1230 else 1231 return SDValue(); 1232 } else { 1233 if (VT == MVT::f32) 1234 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f 1235 : Intrinsic::nvvm_sqrt_approx_f); 1236 else { 1237 // There's no sqrt.approx.f64 instruction, so we emit 1238 // reciprocal(rsqrt(x)). This is faster than 1239 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain 1240 // x * rsqrt(x).) 1241 return DAG.getNode( 1242 ISD::INTRINSIC_WO_CHAIN, DL, VT, 1243 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), 1244 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); 1245 } 1246 } 1247 } 1248 1249 SDValue 1250 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 1251 SDLoc dl(Op); 1252 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); 1253 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); 1254 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); 1255 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); 1256 } 1257 1258 std::string NVPTXTargetLowering::getPrototype( 1259 const DataLayout &DL, Type *retTy, const ArgListTy &Args, 1260 const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment, 1261 ImmutableCallSite CS) const { 1262 auto PtrVT = getPointerTy(DL); 1263 1264 bool isABI = (STI.getSmVersion() >= 20); 1265 assert(isABI && "Non-ABI compilation is not supported"); 1266 if (!isABI) 1267 return ""; 1268 1269 std::stringstream O; 1270 O << "prototype_" << uniqueCallSite << " : .callprototype "; 1271 1272 if (retTy->getTypeID() == Type::VoidTyID) { 1273 O << "()"; 1274 } else { 1275 O << "("; 1276 if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) { 1277 unsigned size = 0; 1278 if (auto *ITy = dyn_cast<IntegerType>(retTy)) { 1279 size = ITy->getBitWidth(); 1280 } else { 1281 assert(retTy->isFloatingPointTy() && 1282 "Floating point type expected here"); 1283 size = retTy->getPrimitiveSizeInBits(); 1284 } 1285 // PTX ABI requires all scalar return values to be at least 32 1286 // bits in size. fp16 normally uses .b16 as its storage type in 1287 // PTX, so its size must be adjusted here, too. 1288 if (size < 32) 1289 size = 32; 1290 1291 O << ".param .b" << size << " _"; 1292 } else if (isa<PointerType>(retTy)) { 1293 O << ".param .b" << PtrVT.getSizeInBits() << " _"; 1294 } else if (retTy->isAggregateType() || retTy->isVectorTy() || 1295 retTy->isIntegerTy(128)) { 1296 O << ".param .align " << retAlignment << " .b8 _[" 1297 << DL.getTypeAllocSize(retTy) << "]"; 1298 } else { 1299 llvm_unreachable("Unknown return type"); 1300 } 1301 O << ") "; 1302 } 1303 O << "_ ("; 1304 1305 bool first = true; 1306 1307 unsigned OIdx = 0; 1308 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1309 Type *Ty = Args[i].Ty; 1310 if (!first) { 1311 O << ", "; 1312 } 1313 first = false; 1314 1315 if (!Outs[OIdx].Flags.isByVal()) { 1316 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { 1317 unsigned align = 0; 1318 const CallInst *CallI = cast<CallInst>(CS.getInstruction()); 1319 // +1 because index 0 is reserved for return type alignment 1320 if (!getAlign(*CallI, i + 1, align)) 1321 align = DL.getABITypeAlignment(Ty); 1322 unsigned sz = DL.getTypeAllocSize(Ty); 1323 O << ".param .align " << align << " .b8 "; 1324 O << "_"; 1325 O << "[" << sz << "]"; 1326 // update the index for Outs 1327 SmallVector<EVT, 16> vtparts; 1328 ComputeValueVTs(*this, DL, Ty, vtparts); 1329 if (unsigned len = vtparts.size()) 1330 OIdx += len - 1; 1331 continue; 1332 } 1333 // i8 types in IR will be i16 types in SDAG 1334 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1335 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1336 "type mismatch between callee prototype and arguments"); 1337 // scalar type 1338 unsigned sz = 0; 1339 if (isa<IntegerType>(Ty)) { 1340 sz = cast<IntegerType>(Ty)->getBitWidth(); 1341 if (sz < 32) 1342 sz = 32; 1343 } else if (isa<PointerType>(Ty)) { 1344 sz = PtrVT.getSizeInBits(); 1345 } else if (Ty->isHalfTy()) 1346 // PTX ABI requires all scalar parameters to be at least 32 1347 // bits in size. fp16 normally uses .b16 as its storage type 1348 // in PTX, so its size must be adjusted here, too. 1349 sz = 32; 1350 else 1351 sz = Ty->getPrimitiveSizeInBits(); 1352 O << ".param .b" << sz << " "; 1353 O << "_"; 1354 continue; 1355 } 1356 auto *PTy = dyn_cast<PointerType>(Ty); 1357 assert(PTy && "Param with byval attribute should be a pointer type"); 1358 Type *ETy = PTy->getElementType(); 1359 1360 unsigned align = Outs[OIdx].Flags.getByValAlign(); 1361 unsigned sz = DL.getTypeAllocSize(ETy); 1362 O << ".param .align " << align << " .b8 "; 1363 O << "_"; 1364 O << "[" << sz << "]"; 1365 } 1366 O << ");"; 1367 return O.str(); 1368 } 1369 1370 unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, 1371 ImmutableCallSite CS, 1372 Type *Ty, unsigned Idx, 1373 const DataLayout &DL) const { 1374 if (!CS) { 1375 // CallSite is zero, fallback to ABI type alignment 1376 return DL.getABITypeAlignment(Ty); 1377 } 1378 1379 unsigned Align = 0; 1380 const Value *DirectCallee = CS.getCalledFunction(); 1381 1382 if (!DirectCallee) { 1383 // We don't have a direct function symbol, but that may be because of 1384 // constant cast instructions in the call. 1385 const Instruction *CalleeI = CS.getInstruction(); 1386 assert(CalleeI && "Call target is not a function or derived value?"); 1387 1388 // With bitcast'd call targets, the instruction will be the call 1389 if (isa<CallInst>(CalleeI)) { 1390 // Check if we have call alignment metadata 1391 if (getAlign(*cast<CallInst>(CalleeI), Idx, Align)) 1392 return Align; 1393 1394 const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue(); 1395 // Ignore any bitcast instructions 1396 while (isa<ConstantExpr>(CalleeV)) { 1397 const ConstantExpr *CE = cast<ConstantExpr>(CalleeV); 1398 if (!CE->isCast()) 1399 break; 1400 // Look through the bitcast 1401 CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0); 1402 } 1403 1404 // We have now looked past all of the bitcasts. Do we finally have a 1405 // Function? 1406 if (isa<Function>(CalleeV)) 1407 DirectCallee = CalleeV; 1408 } 1409 } 1410 1411 // Check for function alignment information if we found that the 1412 // ultimate target is a Function 1413 if (DirectCallee) 1414 if (getAlign(*cast<Function>(DirectCallee), Idx, Align)) 1415 return Align; 1416 1417 // Call is indirect or alignment information is not available, fall back to 1418 // the ABI type alignment 1419 return DL.getABITypeAlignment(Ty); 1420 } 1421 1422 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1423 SmallVectorImpl<SDValue> &InVals) const { 1424 SelectionDAG &DAG = CLI.DAG; 1425 SDLoc dl = CLI.DL; 1426 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1427 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1428 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1429 SDValue Chain = CLI.Chain; 1430 SDValue Callee = CLI.Callee; 1431 bool &isTailCall = CLI.IsTailCall; 1432 ArgListTy &Args = CLI.getArgs(); 1433 Type *RetTy = CLI.RetTy; 1434 ImmutableCallSite CS = CLI.CS; 1435 const DataLayout &DL = DAG.getDataLayout(); 1436 1437 bool isABI = (STI.getSmVersion() >= 20); 1438 assert(isABI && "Non-ABI compilation is not supported"); 1439 if (!isABI) 1440 return Chain; 1441 1442 SDValue tempChain = Chain; 1443 Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl); 1444 SDValue InFlag = Chain.getValue(1); 1445 1446 unsigned paramCount = 0; 1447 // Args.size() and Outs.size() need not match. 1448 // Outs.size() will be larger 1449 // * if there is an aggregate argument with multiple fields (each field 1450 // showing up separately in Outs) 1451 // * if there is a vector argument with more than typical vector-length 1452 // elements (generally if more than 4) where each vector element is 1453 // individually present in Outs. 1454 // So a different index should be used for indexing into Outs/OutVals. 1455 // See similar issue in LowerFormalArguments. 1456 unsigned OIdx = 0; 1457 // Declare the .params or .reg need to pass values 1458 // to the function 1459 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1460 EVT VT = Outs[OIdx].VT; 1461 Type *Ty = Args[i].Ty; 1462 1463 if (!Outs[OIdx].Flags.isByVal()) { 1464 SmallVector<EVT, 16> VTs; 1465 SmallVector<uint64_t, 16> Offsets; 1466 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); 1467 unsigned ArgAlign = 1468 getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL); 1469 unsigned AllocSize = DL.getTypeAllocSize(Ty); 1470 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1471 bool NeedAlign; // Does argument declaration specify alignment? 1472 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { 1473 // declare .param .align <align> .b8 .param<n>[<size>]; 1474 SDValue DeclareParamOps[] = { 1475 Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), 1476 DAG.getConstant(paramCount, dl, MVT::i32), 1477 DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; 1478 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1479 DeclareParamOps); 1480 NeedAlign = true; 1481 } else { 1482 // declare .param .b<size> .param<n>; 1483 if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { 1484 // PTX ABI requires integral types to be at least 32 bits in 1485 // size. FP16 is loaded/stored using i16, so it's handled 1486 // here as well. 1487 AllocSize = 4; 1488 } 1489 SDValue DeclareScalarParamOps[] = { 1490 Chain, DAG.getConstant(paramCount, dl, MVT::i32), 1491 DAG.getConstant(AllocSize * 8, dl, MVT::i32), 1492 DAG.getConstant(0, dl, MVT::i32), InFlag}; 1493 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1494 DeclareScalarParamOps); 1495 NeedAlign = false; 1496 } 1497 InFlag = Chain.getValue(1); 1498 1499 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1500 // than 32-bits are sign extended or zero extended, depending on 1501 // whether they are signed or unsigned types. This case applies 1502 // only to scalar parameters and not to aggregate values. 1503 bool ExtendIntegerParam = 1504 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1505 1506 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); 1507 SmallVector<SDValue, 6> StoreOperands; 1508 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1509 // New store. 1510 if (VectorInfo[j] & PVF_FIRST) { 1511 assert(StoreOperands.empty() && "Unfinished preceding store."); 1512 StoreOperands.push_back(Chain); 1513 StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); 1514 StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); 1515 } 1516 1517 EVT EltVT = VTs[j]; 1518 SDValue StVal = OutVals[OIdx]; 1519 if (ExtendIntegerParam) { 1520 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1521 // zext/sext to i32 1522 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1523 : ISD::ZERO_EXTEND, 1524 dl, MVT::i32, StVal); 1525 } else if (EltVT.getSizeInBits() < 16) { 1526 // Use 16-bit registers for small stores as it's the 1527 // smallest general purpose register size supported by NVPTX. 1528 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1529 } 1530 1531 // Record the value to store. 1532 StoreOperands.push_back(StVal); 1533 1534 if (VectorInfo[j] & PVF_LAST) { 1535 unsigned NumElts = StoreOperands.size() - 3; 1536 NVPTXISD::NodeType Op; 1537 switch (NumElts) { 1538 case 1: 1539 Op = NVPTXISD::StoreParam; 1540 break; 1541 case 2: 1542 Op = NVPTXISD::StoreParamV2; 1543 break; 1544 case 4: 1545 Op = NVPTXISD::StoreParamV4; 1546 break; 1547 default: 1548 llvm_unreachable("Invalid vector info."); 1549 } 1550 1551 StoreOperands.push_back(InFlag); 1552 1553 // Adjust type of the store op if we've extended the scalar 1554 // return value. 1555 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; 1556 unsigned EltAlign = 1557 NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0; 1558 1559 Chain = DAG.getMemIntrinsicNode( 1560 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1561 TheStoreType, MachinePointerInfo(), EltAlign, 1562 MachineMemOperand::MOStore); 1563 InFlag = Chain.getValue(1); 1564 1565 // Cleanup. 1566 StoreOperands.clear(); 1567 } 1568 ++OIdx; 1569 } 1570 assert(StoreOperands.empty() && "Unfinished parameter store."); 1571 if (VTs.size() > 0) 1572 --OIdx; 1573 ++paramCount; 1574 continue; 1575 } 1576 1577 // ByVal arguments 1578 SmallVector<EVT, 16> VTs; 1579 SmallVector<uint64_t, 16> Offsets; 1580 auto *PTy = dyn_cast<PointerType>(Args[i].Ty); 1581 assert(PTy && "Type of a byval parameter should be pointer"); 1582 ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0); 1583 1584 // declare .param .align <align> .b8 .param<n>[<size>]; 1585 unsigned sz = Outs[OIdx].Flags.getByValSize(); 1586 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1587 unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign(); 1588 // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, 1589 // so we don't need to worry about natural alignment or not. 1590 // See TargetLowering::LowerCallTo(). 1591 1592 // Enforce minumum alignment of 4 to work around ptxas miscompile 1593 // for sm_50+. See corresponding alignment adjustment in 1594 // emitFunctionParamList() for details. 1595 if (ArgAlign < 4) 1596 ArgAlign = 4; 1597 SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32), 1598 DAG.getConstant(paramCount, dl, MVT::i32), 1599 DAG.getConstant(sz, dl, MVT::i32), InFlag}; 1600 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1601 DeclareParamOps); 1602 InFlag = Chain.getValue(1); 1603 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1604 EVT elemtype = VTs[j]; 1605 int curOffset = Offsets[j]; 1606 unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset); 1607 auto PtrVT = getPointerTy(DL); 1608 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx], 1609 DAG.getConstant(curOffset, dl, PtrVT)); 1610 SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, 1611 MachinePointerInfo(), PartAlign); 1612 if (elemtype.getSizeInBits() < 16) { 1613 theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal); 1614 } 1615 SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1616 SDValue CopyParamOps[] = { Chain, 1617 DAG.getConstant(paramCount, dl, MVT::i32), 1618 DAG.getConstant(curOffset, dl, MVT::i32), 1619 theVal, InFlag }; 1620 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, 1621 CopyParamOps, elemtype, 1622 MachinePointerInfo(), /* Align */ 0, 1623 MachineMemOperand::MOStore); 1624 1625 InFlag = Chain.getValue(1); 1626 } 1627 ++paramCount; 1628 } 1629 1630 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); 1631 unsigned retAlignment = 0; 1632 1633 // Handle Result 1634 if (Ins.size() > 0) { 1635 SmallVector<EVT, 16> resvtparts; 1636 ComputeValueVTs(*this, DL, RetTy, resvtparts); 1637 1638 // Declare 1639 // .param .align 16 .b8 retval0[<size-in-bytes>], or 1640 // .param .b<size-in-bits> retval0 1641 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); 1642 // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for 1643 // these three types to match the logic in 1644 // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype. 1645 // Plus, this behavior is consistent with nvcc's. 1646 if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() || 1647 (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) { 1648 // Scalar needs to be at least 32bit wide 1649 if (resultsz < 32) 1650 resultsz = 32; 1651 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1652 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1653 DAG.getConstant(resultsz, dl, MVT::i32), 1654 DAG.getConstant(0, dl, MVT::i32), InFlag }; 1655 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, 1656 DeclareRetOps); 1657 InFlag = Chain.getValue(1); 1658 } else { 1659 retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL); 1660 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1661 SDValue DeclareRetOps[] = { Chain, 1662 DAG.getConstant(retAlignment, dl, MVT::i32), 1663 DAG.getConstant(resultsz / 8, dl, MVT::i32), 1664 DAG.getConstant(0, dl, MVT::i32), InFlag }; 1665 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, 1666 DeclareRetOps); 1667 InFlag = Chain.getValue(1); 1668 } 1669 } 1670 1671 // Both indirect calls and libcalls have nullptr Func. In order to distinguish 1672 // between them we must rely on the call site value which is valid for 1673 // indirect calls but is always null for libcalls. 1674 bool isIndirectCall = !Func && CS; 1675 1676 if (isa<ExternalSymbolSDNode>(Callee)) { 1677 Function* CalleeFunc = nullptr; 1678 1679 // Try to find the callee in the current module. 1680 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc); 1681 assert(CalleeFunc != nullptr && "Libcall callee must be set."); 1682 1683 // Set the "libcall callee" attribute to indicate that the function 1684 // must always have a declaration. 1685 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true"); 1686 } 1687 1688 if (isIndirectCall) { 1689 // This is indirect function call case : PTX requires a prototype of the 1690 // form 1691 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); 1692 // to be emitted, and the label has to used as the last arg of call 1693 // instruction. 1694 // The prototype is embedded in a string and put as the operand for a 1695 // CallPrototype SDNode which will print out to the value of the string. 1696 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1697 std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS); 1698 const char *ProtoStr = 1699 nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); 1700 SDValue ProtoOps[] = { 1701 Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag, 1702 }; 1703 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); 1704 InFlag = Chain.getValue(1); 1705 } 1706 // Op to just print "call" 1707 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1708 SDValue PrintCallOps[] = { 1709 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag 1710 }; 1711 // We model convergent calls as separate opcodes. 1712 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; 1713 if (CLI.IsConvergent) 1714 Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni 1715 : NVPTXISD::PrintConvergentCall; 1716 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 1717 InFlag = Chain.getValue(1); 1718 1719 // Ops to print out the function name 1720 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1721 SDValue CallVoidOps[] = { Chain, Callee, InFlag }; 1722 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 1723 InFlag = Chain.getValue(1); 1724 1725 // Ops to print out the param list 1726 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1727 SDValue CallArgBeginOps[] = { Chain, InFlag }; 1728 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 1729 CallArgBeginOps); 1730 InFlag = Chain.getValue(1); 1731 1732 for (unsigned i = 0, e = paramCount; i != e; ++i) { 1733 unsigned opcode; 1734 if (i == (e - 1)) 1735 opcode = NVPTXISD::LastCallArg; 1736 else 1737 opcode = NVPTXISD::CallArg; 1738 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1739 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1740 DAG.getConstant(i, dl, MVT::i32), InFlag }; 1741 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 1742 InFlag = Chain.getValue(1); 1743 } 1744 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1745 SDValue CallArgEndOps[] = { Chain, 1746 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32), 1747 InFlag }; 1748 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 1749 InFlag = Chain.getValue(1); 1750 1751 if (isIndirectCall) { 1752 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1753 SDValue PrototypeOps[] = { Chain, 1754 DAG.getConstant(uniqueCallSite, dl, MVT::i32), 1755 InFlag }; 1756 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 1757 InFlag = Chain.getValue(1); 1758 } 1759 1760 SmallVector<SDValue, 16> ProxyRegOps; 1761 SmallVector<Optional<MVT>, 16> ProxyRegTruncates; 1762 1763 // Generate loads from param memory/moves from registers for result 1764 if (Ins.size() > 0) { 1765 SmallVector<EVT, 16> VTs; 1766 SmallVector<uint64_t, 16> Offsets; 1767 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 1768 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 1769 1770 unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL); 1771 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 1772 1773 SmallVector<EVT, 6> LoadVTs; 1774 int VecIdx = -1; // Index of the first element of the vector. 1775 1776 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 1777 // 32-bits are sign extended or zero extended, depending on whether 1778 // they are signed or unsigned types. 1779 bool ExtendIntegerRetVal = 1780 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 1781 1782 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 1783 bool needTruncate = false; 1784 EVT TheLoadType = VTs[i]; 1785 EVT EltType = Ins[i].VT; 1786 unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]); 1787 if (ExtendIntegerRetVal) { 1788 TheLoadType = MVT::i32; 1789 EltType = MVT::i32; 1790 needTruncate = true; 1791 } else if (TheLoadType.getSizeInBits() < 16) { 1792 if (VTs[i].isInteger()) 1793 needTruncate = true; 1794 EltType = MVT::i16; 1795 } 1796 1797 // Record index of the very first element of the vector. 1798 if (VectorInfo[i] & PVF_FIRST) { 1799 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 1800 VecIdx = i; 1801 } 1802 1803 LoadVTs.push_back(EltType); 1804 1805 if (VectorInfo[i] & PVF_LAST) { 1806 unsigned NumElts = LoadVTs.size(); 1807 LoadVTs.push_back(MVT::Other); 1808 LoadVTs.push_back(MVT::Glue); 1809 NVPTXISD::NodeType Op; 1810 switch (NumElts) { 1811 case 1: 1812 Op = NVPTXISD::LoadParam; 1813 break; 1814 case 2: 1815 Op = NVPTXISD::LoadParamV2; 1816 break; 1817 case 4: 1818 Op = NVPTXISD::LoadParamV4; 1819 break; 1820 default: 1821 llvm_unreachable("Invalid vector info."); 1822 } 1823 1824 SDValue LoadOperands[] = { 1825 Chain, DAG.getConstant(1, dl, MVT::i32), 1826 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; 1827 SDValue RetVal = DAG.getMemIntrinsicNode( 1828 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 1829 MachinePointerInfo(), EltAlign, 1830 MachineMemOperand::MOLoad); 1831 1832 for (unsigned j = 0; j < NumElts; ++j) { 1833 ProxyRegOps.push_back(RetVal.getValue(j)); 1834 1835 if (needTruncate) 1836 ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT)); 1837 else 1838 ProxyRegTruncates.push_back(Optional<MVT>()); 1839 } 1840 1841 Chain = RetVal.getValue(NumElts); 1842 InFlag = RetVal.getValue(NumElts + 1); 1843 1844 // Cleanup 1845 VecIdx = -1; 1846 LoadVTs.clear(); 1847 } 1848 } 1849 } 1850 1851 Chain = DAG.getCALLSEQ_END(Chain, 1852 DAG.getIntPtrConstant(uniqueCallSite, dl, true), 1853 DAG.getIntPtrConstant(uniqueCallSite + 1, dl, 1854 true), 1855 InFlag, dl); 1856 InFlag = Chain.getValue(1); 1857 uniqueCallSite++; 1858 1859 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 1860 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 1861 // dangling. 1862 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 1863 SDValue Ret = DAG.getNode( 1864 NVPTXISD::ProxyReg, dl, 1865 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 1866 { Chain, ProxyRegOps[i], InFlag } 1867 ); 1868 1869 Chain = Ret.getValue(1); 1870 InFlag = Ret.getValue(2); 1871 1872 if (ProxyRegTruncates[i].hasValue()) { 1873 Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret); 1874 } 1875 1876 InVals.push_back(Ret); 1877 } 1878 1879 // set isTailCall to false for now, until we figure out how to express 1880 // tail call optimization in PTX 1881 isTailCall = false; 1882 return Chain; 1883 } 1884 1885 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 1886 // (see LegalizeDAG.cpp). This is slow and uses local memory. 1887 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 1888 SDValue 1889 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 1890 SDNode *Node = Op.getNode(); 1891 SDLoc dl(Node); 1892 SmallVector<SDValue, 8> Ops; 1893 unsigned NumOperands = Node->getNumOperands(); 1894 for (unsigned i = 0; i < NumOperands; ++i) { 1895 SDValue SubOp = Node->getOperand(i); 1896 EVT VVT = SubOp.getNode()->getValueType(0); 1897 EVT EltVT = VVT.getVectorElementType(); 1898 unsigned NumSubElem = VVT.getVectorNumElements(); 1899 for (unsigned j = 0; j < NumSubElem; ++j) { 1900 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 1901 DAG.getIntPtrConstant(j, dl))); 1902 } 1903 } 1904 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 1905 } 1906 1907 // We can init constant f16x2 with a single .b32 move. Normally it 1908 // would get lowered as two constant loads and vector-packing move. 1909 // mov.b16 %h1, 0x4000; 1910 // mov.b16 %h2, 0x3C00; 1911 // mov.b32 %hh2, {%h2, %h1}; 1912 // Instead we want just a constant move: 1913 // mov.b32 %hh2, 0x40003C00 1914 // 1915 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 1916 // generates good SASS in both cases. 1917 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 1918 SelectionDAG &DAG) const { 1919 //return Op; 1920 if (!(Op->getValueType(0) == MVT::v2f16 && 1921 isa<ConstantFPSDNode>(Op->getOperand(0)) && 1922 isa<ConstantFPSDNode>(Op->getOperand(1)))) 1923 return Op; 1924 1925 APInt E0 = 1926 cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); 1927 APInt E1 = 1928 cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); 1929 SDValue Const = 1930 DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); 1931 return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const); 1932 } 1933 1934 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 1935 SelectionDAG &DAG) const { 1936 SDValue Index = Op->getOperand(1); 1937 // Constant index will be matched by tablegen. 1938 if (isa<ConstantSDNode>(Index.getNode())) 1939 return Op; 1940 1941 // Extract individual elements and select one of them. 1942 SDValue Vector = Op->getOperand(0); 1943 EVT VectorVT = Vector.getValueType(); 1944 assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); 1945 EVT EltVT = VectorVT.getVectorElementType(); 1946 1947 SDLoc dl(Op.getNode()); 1948 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 1949 DAG.getIntPtrConstant(0, dl)); 1950 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 1951 DAG.getIntPtrConstant(1, dl)); 1952 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 1953 ISD::CondCode::SETEQ); 1954 } 1955 1956 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 1957 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 1958 /// amount, or 1959 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 1960 /// amount. 1961 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 1962 SelectionDAG &DAG) const { 1963 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 1964 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 1965 1966 EVT VT = Op.getValueType(); 1967 unsigned VTBits = VT.getSizeInBits(); 1968 SDLoc dl(Op); 1969 SDValue ShOpLo = Op.getOperand(0); 1970 SDValue ShOpHi = Op.getOperand(1); 1971 SDValue ShAmt = Op.getOperand(2); 1972 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 1973 1974 if (VTBits == 32 && STI.getSmVersion() >= 35) { 1975 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 1976 // {dHi, dLo} = {aHi, aLo} >> Amt 1977 // dHi = aHi >> Amt 1978 // dLo = shf.r.clamp aLo, aHi, Amt 1979 1980 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 1981 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 1982 ShAmt); 1983 1984 SDValue Ops[2] = { Lo, Hi }; 1985 return DAG.getMergeValues(Ops, dl); 1986 } 1987 else { 1988 // {dHi, dLo} = {aHi, aLo} >> Amt 1989 // - if (Amt>=size) then 1990 // dLo = aHi >> (Amt-size) 1991 // dHi = aHi >> Amt (this is either all 0 or all 1) 1992 // else 1993 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 1994 // dHi = aHi >> Amt 1995 1996 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 1997 DAG.getConstant(VTBits, dl, MVT::i32), 1998 ShAmt); 1999 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 2000 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2001 DAG.getConstant(VTBits, dl, MVT::i32)); 2002 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 2003 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2004 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2005 2006 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2007 DAG.getConstant(VTBits, dl, MVT::i32), 2008 ISD::SETGE); 2009 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2010 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2011 2012 SDValue Ops[2] = { Lo, Hi }; 2013 return DAG.getMergeValues(Ops, dl); 2014 } 2015 } 2016 2017 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2018 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2019 /// amount, or 2020 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2021 /// amount. 2022 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2023 SelectionDAG &DAG) const { 2024 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2025 assert(Op.getOpcode() == ISD::SHL_PARTS); 2026 2027 EVT VT = Op.getValueType(); 2028 unsigned VTBits = VT.getSizeInBits(); 2029 SDLoc dl(Op); 2030 SDValue ShOpLo = Op.getOperand(0); 2031 SDValue ShOpHi = Op.getOperand(1); 2032 SDValue ShAmt = Op.getOperand(2); 2033 2034 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2035 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 2036 // {dHi, dLo} = {aHi, aLo} << Amt 2037 // dHi = shf.l.clamp aLo, aHi, Amt 2038 // dLo = aLo << Amt 2039 2040 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, 2041 ShAmt); 2042 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2043 2044 SDValue Ops[2] = { Lo, Hi }; 2045 return DAG.getMergeValues(Ops, dl); 2046 } 2047 else { 2048 // {dHi, dLo} = {aHi, aLo} << Amt 2049 // - if (Amt>=size) then 2050 // dLo = aLo << Amt (all 0) 2051 // dLo = aLo << (Amt-size) 2052 // else 2053 // dLo = aLo << Amt 2054 // dHi = (aHi << Amt) | (aLo >> (size-Amt)) 2055 2056 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2057 DAG.getConstant(VTBits, dl, MVT::i32), 2058 ShAmt); 2059 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 2060 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2061 DAG.getConstant(VTBits, dl, MVT::i32)); 2062 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 2063 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2064 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 2065 2066 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2067 DAG.getConstant(VTBits, dl, MVT::i32), 2068 ISD::SETGE); 2069 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2070 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2071 2072 SDValue Ops[2] = { Lo, Hi }; 2073 return DAG.getMergeValues(Ops, dl); 2074 } 2075 } 2076 2077 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2078 EVT VT = Op.getValueType(); 2079 2080 if (VT == MVT::f32) 2081 return LowerFROUND32(Op, DAG); 2082 2083 if (VT == MVT::f64) 2084 return LowerFROUND64(Op, DAG); 2085 2086 llvm_unreachable("unhandled type"); 2087 } 2088 2089 // This is the the rounding method used in CUDA libdevice in C like code: 2090 // float roundf(float A) 2091 // { 2092 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)); 2093 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2094 // return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2095 // } 2096 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op, 2097 SelectionDAG &DAG) const { 2098 SDLoc SL(Op); 2099 SDValue A = Op.getOperand(0); 2100 EVT VT = Op.getValueType(); 2101 2102 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2103 2104 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)) 2105 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A); 2106 const int SignBitMask = 0x80000000; 2107 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast, 2108 DAG.getConstant(SignBitMask, SL, MVT::i32)); 2109 const int PointFiveInBits = 0x3F000000; 2110 SDValue PointFiveWithSignRaw = 2111 DAG.getNode(ISD::OR, SL, MVT::i32, Sign, 2112 DAG.getConstant(PointFiveInBits, SL, MVT::i32)); 2113 SDValue PointFiveWithSign = 2114 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw); 2115 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign); 2116 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2117 2118 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2119 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2120 SDValue IsLarge = 2121 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT), 2122 ISD::SETOGT); 2123 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2124 2125 // return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2126 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2127 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2128 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2129 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2130 } 2131 2132 // The implementation of round(double) is similar to that of round(float) in 2133 // that they both separate the value range into three regions and use a method 2134 // specific to the region to round the values. However, round(double) first 2135 // calculates the round of the absolute value and then adds the sign back while 2136 // round(float) directly rounds the value with sign. 2137 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2138 SelectionDAG &DAG) const { 2139 SDLoc SL(Op); 2140 SDValue A = Op.getOperand(0); 2141 EVT VT = Op.getValueType(); 2142 2143 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2144 2145 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2146 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2147 DAG.getConstantFP(0.5, SL, VT)); 2148 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2149 2150 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2151 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2152 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2153 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2154 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2155 DAG.getConstantFP(0, SL, VT), 2156 RoundedA); 2157 2158 // Add sign to rounded_A 2159 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2160 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2161 2162 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2163 SDValue IsLarge = 2164 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2165 ISD::SETOGT); 2166 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2167 } 2168 2169 2170 2171 SDValue 2172 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2173 switch (Op.getOpcode()) { 2174 case ISD::RETURNADDR: 2175 return SDValue(); 2176 case ISD::FRAMEADDR: 2177 return SDValue(); 2178 case ISD::GlobalAddress: 2179 return LowerGlobalAddress(Op, DAG); 2180 case ISD::INTRINSIC_W_CHAIN: 2181 return Op; 2182 case ISD::BUILD_VECTOR: 2183 return LowerBUILD_VECTOR(Op, DAG); 2184 case ISD::EXTRACT_SUBVECTOR: 2185 return Op; 2186 case ISD::EXTRACT_VECTOR_ELT: 2187 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2188 case ISD::CONCAT_VECTORS: 2189 return LowerCONCAT_VECTORS(Op, DAG); 2190 case ISD::STORE: 2191 return LowerSTORE(Op, DAG); 2192 case ISD::LOAD: 2193 return LowerLOAD(Op, DAG); 2194 case ISD::SHL_PARTS: 2195 return LowerShiftLeftParts(Op, DAG); 2196 case ISD::SRA_PARTS: 2197 case ISD::SRL_PARTS: 2198 return LowerShiftRightParts(Op, DAG); 2199 case ISD::SELECT: 2200 return LowerSelect(Op, DAG); 2201 case ISD::FROUND: 2202 return LowerFROUND(Op, DAG); 2203 default: 2204 llvm_unreachable("Custom lowering not defined for operation"); 2205 } 2206 } 2207 2208 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2209 SDValue Op0 = Op->getOperand(0); 2210 SDValue Op1 = Op->getOperand(1); 2211 SDValue Op2 = Op->getOperand(2); 2212 SDLoc DL(Op.getNode()); 2213 2214 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2215 2216 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2217 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2218 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 2219 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2220 2221 return Trunc; 2222 } 2223 2224 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2225 if (Op.getValueType() == MVT::i1) 2226 return LowerLOADi1(Op, DAG); 2227 2228 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2229 // loads and have to handle it here. 2230 if (Op.getValueType() == MVT::v2f16) { 2231 LoadSDNode *Load = cast<LoadSDNode>(Op); 2232 EVT MemVT = Load->getMemoryVT(); 2233 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2234 MemVT, *Load->getMemOperand())) { 2235 SDValue Ops[2]; 2236 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2237 return DAG.getMergeValues(Ops, SDLoc(Op)); 2238 } 2239 } 2240 2241 return SDValue(); 2242 } 2243 2244 // v = ld i1* addr 2245 // => 2246 // v1 = ld i8* addr (-> i16) 2247 // v = trunc i16 to i1 2248 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2249 SDNode *Node = Op.getNode(); 2250 LoadSDNode *LD = cast<LoadSDNode>(Node); 2251 SDLoc dl(Node); 2252 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2253 assert(Node->getValueType(0) == MVT::i1 && 2254 "Custom lowering for i1 load only"); 2255 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), 2256 LD->getPointerInfo(), LD->getAlignment(), 2257 LD->getMemOperand()->getFlags()); 2258 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2259 // The legalizer (the caller) is expecting two values from the legalized 2260 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2261 // in LegalizeDAG.cpp which also uses MergeValues. 2262 SDValue Ops[] = { result, LD->getChain() }; 2263 return DAG.getMergeValues(Ops, dl); 2264 } 2265 2266 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2267 StoreSDNode *Store = cast<StoreSDNode>(Op); 2268 EVT VT = Store->getMemoryVT(); 2269 2270 if (VT == MVT::i1) 2271 return LowerSTOREi1(Op, DAG); 2272 2273 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2274 // stores and have to handle it here. 2275 if (VT == MVT::v2f16 && 2276 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2277 VT, *Store->getMemOperand())) 2278 return expandUnalignedStore(Store, DAG); 2279 2280 if (VT.isVector()) 2281 return LowerSTOREVector(Op, DAG); 2282 2283 return SDValue(); 2284 } 2285 2286 SDValue 2287 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2288 SDNode *N = Op.getNode(); 2289 SDValue Val = N->getOperand(1); 2290 SDLoc DL(N); 2291 EVT ValVT = Val.getValueType(); 2292 2293 if (ValVT.isVector()) { 2294 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2295 // legal. We can (and should) split that into 2 stores of <2 x double> here 2296 // but I'm leaving that as a TODO for now. 2297 if (!ValVT.isSimple()) 2298 return SDValue(); 2299 switch (ValVT.getSimpleVT().SimpleTy) { 2300 default: 2301 return SDValue(); 2302 case MVT::v2i8: 2303 case MVT::v2i16: 2304 case MVT::v2i32: 2305 case MVT::v2i64: 2306 case MVT::v2f16: 2307 case MVT::v2f32: 2308 case MVT::v2f64: 2309 case MVT::v4i8: 2310 case MVT::v4i16: 2311 case MVT::v4i32: 2312 case MVT::v4f16: 2313 case MVT::v4f32: 2314 case MVT::v8f16: // <4 x f16x2> 2315 // This is a "native" vector type 2316 break; 2317 } 2318 2319 MemSDNode *MemSD = cast<MemSDNode>(N); 2320 const DataLayout &TD = DAG.getDataLayout(); 2321 2322 unsigned Align = MemSD->getAlignment(); 2323 unsigned PrefAlign = 2324 TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext())); 2325 if (Align < PrefAlign) { 2326 // This store is not sufficiently aligned, so bail out and let this vector 2327 // store be scalarized. Note that we may still be able to emit smaller 2328 // vector stores. For example, if we are storing a <4 x float> with an 2329 // alignment of 8, this check will fail but the legalizer will try again 2330 // with 2 x <2 x float>, which will succeed with an alignment of 8. 2331 return SDValue(); 2332 } 2333 2334 unsigned Opcode = 0; 2335 EVT EltVT = ValVT.getVectorElementType(); 2336 unsigned NumElts = ValVT.getVectorNumElements(); 2337 2338 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2339 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 2340 // stored type to i16 and propagate the "real" type as the memory type. 2341 bool NeedExt = false; 2342 if (EltVT.getSizeInBits() < 16) 2343 NeedExt = true; 2344 2345 bool StoreF16x2 = false; 2346 switch (NumElts) { 2347 default: 2348 return SDValue(); 2349 case 2: 2350 Opcode = NVPTXISD::StoreV2; 2351 break; 2352 case 4: 2353 Opcode = NVPTXISD::StoreV4; 2354 break; 2355 case 8: 2356 // v8f16 is a special case. PTX doesn't have st.v8.f16 2357 // instruction. Instead, we split the vector into v2f16 chunks and 2358 // store them with st.v4.b32. 2359 assert(EltVT == MVT::f16 && "Wrong type for the vector."); 2360 Opcode = NVPTXISD::StoreV4; 2361 StoreF16x2 = true; 2362 break; 2363 } 2364 2365 SmallVector<SDValue, 8> Ops; 2366 2367 // First is the chain 2368 Ops.push_back(N->getOperand(0)); 2369 2370 if (StoreF16x2) { 2371 // Combine f16,f16 -> v2f16 2372 NumElts /= 2; 2373 for (unsigned i = 0; i < NumElts; ++i) { 2374 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2375 DAG.getIntPtrConstant(i * 2, DL)); 2376 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2377 DAG.getIntPtrConstant(i * 2 + 1, DL)); 2378 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); 2379 Ops.push_back(V2); 2380 } 2381 } else { 2382 // Then the split values 2383 for (unsigned i = 0; i < NumElts; ++i) { 2384 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2385 DAG.getIntPtrConstant(i, DL)); 2386 if (NeedExt) 2387 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 2388 Ops.push_back(ExtVal); 2389 } 2390 } 2391 2392 // Then any remaining arguments 2393 Ops.append(N->op_begin() + 2, N->op_end()); 2394 2395 SDValue NewSt = 2396 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 2397 MemSD->getMemoryVT(), MemSD->getMemOperand()); 2398 2399 // return DCI.CombineTo(N, NewSt, true); 2400 return NewSt; 2401 } 2402 2403 return SDValue(); 2404 } 2405 2406 // st i1 v, addr 2407 // => 2408 // v1 = zxt v to i16 2409 // st.u8 i16, addr 2410 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 2411 SDNode *Node = Op.getNode(); 2412 SDLoc dl(Node); 2413 StoreSDNode *ST = cast<StoreSDNode>(Node); 2414 SDValue Tmp1 = ST->getChain(); 2415 SDValue Tmp2 = ST->getBasePtr(); 2416 SDValue Tmp3 = ST->getValue(); 2417 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 2418 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 2419 SDValue Result = 2420 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 2421 ST->getAlignment(), ST->getMemOperand()->getFlags()); 2422 return Result; 2423 } 2424 2425 SDValue 2426 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { 2427 std::string ParamSym; 2428 raw_string_ostream ParamStr(ParamSym); 2429 2430 ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx; 2431 ParamStr.flush(); 2432 2433 std::string *SavedStr = 2434 nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str()); 2435 return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); 2436 } 2437 2438 // Check to see if the kernel argument is image*_t or sampler_t 2439 2440 static bool isImageOrSamplerVal(const Value *arg, const Module *context) { 2441 static const char *const specialTypes[] = { "struct._image2d_t", 2442 "struct._image3d_t", 2443 "struct._sampler_t" }; 2444 2445 Type *Ty = arg->getType(); 2446 auto *PTy = dyn_cast<PointerType>(Ty); 2447 2448 if (!PTy) 2449 return false; 2450 2451 if (!context) 2452 return false; 2453 2454 auto *STy = dyn_cast<StructType>(PTy->getElementType()); 2455 if (!STy || STy->isLiteral()) 2456 return false; 2457 2458 return std::find(std::begin(specialTypes), std::end(specialTypes), 2459 STy->getName()) != std::end(specialTypes); 2460 } 2461 2462 SDValue NVPTXTargetLowering::LowerFormalArguments( 2463 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2464 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2465 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2466 MachineFunction &MF = DAG.getMachineFunction(); 2467 const DataLayout &DL = DAG.getDataLayout(); 2468 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2469 2470 const Function *F = &MF.getFunction(); 2471 const AttributeList &PAL = F->getAttributes(); 2472 const TargetLowering *TLI = STI.getTargetLowering(); 2473 2474 SDValue Root = DAG.getRoot(); 2475 std::vector<SDValue> OutChains; 2476 2477 bool isABI = (STI.getSmVersion() >= 20); 2478 assert(isABI && "Non-ABI compilation is not supported"); 2479 if (!isABI) 2480 return Chain; 2481 2482 std::vector<Type *> argTypes; 2483 std::vector<const Argument *> theArgs; 2484 for (const Argument &I : F->args()) { 2485 theArgs.push_back(&I); 2486 argTypes.push_back(I.getType()); 2487 } 2488 // argTypes.size() (or theArgs.size()) and Ins.size() need not match. 2489 // Ins.size() will be larger 2490 // * if there is an aggregate argument with multiple fields (each field 2491 // showing up separately in Ins) 2492 // * if there is a vector argument with more than typical vector-length 2493 // elements (generally if more than 4) where each vector element is 2494 // individually present in Ins. 2495 // So a different index should be used for indexing into Ins. 2496 // See similar issue in LowerCall. 2497 unsigned InsIdx = 0; 2498 2499 int idx = 0; 2500 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { 2501 Type *Ty = argTypes[i]; 2502 2503 // If the kernel argument is image*_t or sampler_t, convert it to 2504 // a i32 constant holding the parameter position. This can later 2505 // matched in the AsmPrinter to output the correct mangled name. 2506 if (isImageOrSamplerVal( 2507 theArgs[i], 2508 (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent() 2509 : nullptr))) { 2510 assert(isKernelFunction(*F) && 2511 "Only kernels can have image/sampler params"); 2512 InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); 2513 continue; 2514 } 2515 2516 if (theArgs[i]->use_empty()) { 2517 // argument is dead 2518 if (Ty->isAggregateType() || Ty->isIntegerTy(128)) { 2519 SmallVector<EVT, 16> vtparts; 2520 2521 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); 2522 assert(vtparts.size() > 0 && "empty aggregate type not expected"); 2523 for (unsigned parti = 0, parte = vtparts.size(); parti != parte; 2524 ++parti) { 2525 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2526 ++InsIdx; 2527 } 2528 if (vtparts.size() > 0) 2529 --InsIdx; 2530 continue; 2531 } 2532 if (Ty->isVectorTy()) { 2533 EVT ObjectVT = getValueType(DL, Ty); 2534 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); 2535 for (unsigned parti = 0; parti < NumRegs; ++parti) { 2536 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2537 ++InsIdx; 2538 } 2539 if (NumRegs > 0) 2540 --InsIdx; 2541 continue; 2542 } 2543 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2544 continue; 2545 } 2546 2547 // In the following cases, assign a node order of "idx+1" 2548 // to newly created nodes. The SDNodes for params have to 2549 // appear in the same order as their order of appearance 2550 // in the original function. "idx+1" holds that order. 2551 if (!PAL.hasParamAttribute(i, Attribute::ByVal)) { 2552 bool aggregateIsPacked = false; 2553 if (StructType *STy = dyn_cast<StructType>(Ty)) 2554 aggregateIsPacked = STy->isPacked(); 2555 2556 SmallVector<EVT, 16> VTs; 2557 SmallVector<uint64_t, 16> Offsets; 2558 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 2559 assert(VTs.size() > 0 && "Unexpected empty type."); 2560 auto VectorInfo = 2561 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty)); 2562 2563 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2564 int VecIdx = -1; // Index of the first element of the current vector. 2565 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 2566 if (VectorInfo[parti] & PVF_FIRST) { 2567 assert(VecIdx == -1 && "Orphaned vector."); 2568 VecIdx = parti; 2569 } 2570 2571 // That's the last element of this store op. 2572 if (VectorInfo[parti] & PVF_LAST) { 2573 unsigned NumElts = parti - VecIdx + 1; 2574 EVT EltVT = VTs[parti]; 2575 // i1 is loaded/stored as i8. 2576 EVT LoadVT = EltVT; 2577 if (EltVT == MVT::i1) 2578 LoadVT = MVT::i8; 2579 else if (EltVT == MVT::v2f16) 2580 // getLoad needs a vector type, but it can't handle 2581 // vectors which contain v2f16 elements. So we must load 2582 // using i32 here and then bitcast back. 2583 LoadVT = MVT::i32; 2584 2585 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 2586 SDValue VecAddr = 2587 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 2588 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 2589 Value *srcValue = Constant::getNullValue(PointerType::get( 2590 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 2591 SDValue P = 2592 DAG.getLoad(VecVT, dl, Root, VecAddr, 2593 MachinePointerInfo(srcValue), aggregateIsPacked, 2594 MachineMemOperand::MODereferenceable | 2595 MachineMemOperand::MOInvariant); 2596 if (P.getNode()) 2597 P.getNode()->setIROrder(idx + 1); 2598 for (unsigned j = 0; j < NumElts; ++j) { 2599 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 2600 DAG.getIntPtrConstant(j, dl)); 2601 // We've loaded i1 as an i8 and now must truncate it back to i1 2602 if (EltVT == MVT::i1) 2603 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 2604 // v2f16 was loaded as an i32. Now we must bitcast it back. 2605 else if (EltVT == MVT::v2f16) 2606 Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); 2607 // Extend the element if necessary (e.g. an i8 is loaded 2608 // into an i16 register) 2609 if (Ins[InsIdx].VT.isInteger() && 2610 Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { 2611 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 2612 : ISD::ZERO_EXTEND; 2613 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 2614 } 2615 InVals.push_back(Elt); 2616 } 2617 2618 // Reset vector tracking state. 2619 VecIdx = -1; 2620 } 2621 ++InsIdx; 2622 } 2623 if (VTs.size() > 0) 2624 --InsIdx; 2625 continue; 2626 } 2627 2628 // Param has ByVal attribute 2629 // Return MoveParam(param symbol). 2630 // Ideally, the param symbol can be returned directly, 2631 // but when SDNode builder decides to use it in a CopyToReg(), 2632 // machine instruction fails because TargetExternalSymbol 2633 // (not lowered) is target dependent, and CopyToReg assumes 2634 // the source is lowered. 2635 EVT ObjectVT = getValueType(DL, Ty); 2636 assert(ObjectVT == Ins[InsIdx].VT && 2637 "Ins type did not match function type"); 2638 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2639 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 2640 if (p.getNode()) 2641 p.getNode()->setIROrder(idx + 1); 2642 InVals.push_back(p); 2643 } 2644 2645 // Clang will check explicit VarArg and issue error if any. However, Clang 2646 // will let code with 2647 // implicit var arg like f() pass. See bug 617733. 2648 // We treat this case as if the arg list is empty. 2649 // if (F.isVarArg()) { 2650 // assert(0 && "VarArg not supported yet!"); 2651 //} 2652 2653 if (!OutChains.empty()) 2654 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 2655 2656 return Chain; 2657 } 2658 2659 SDValue 2660 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2661 bool isVarArg, 2662 const SmallVectorImpl<ISD::OutputArg> &Outs, 2663 const SmallVectorImpl<SDValue> &OutVals, 2664 const SDLoc &dl, SelectionDAG &DAG) const { 2665 MachineFunction &MF = DAG.getMachineFunction(); 2666 Type *RetTy = MF.getFunction().getReturnType(); 2667 2668 bool isABI = (STI.getSmVersion() >= 20); 2669 assert(isABI && "Non-ABI compilation is not supported"); 2670 if (!isABI) 2671 return Chain; 2672 2673 const DataLayout DL = DAG.getDataLayout(); 2674 SmallVector<EVT, 16> VTs; 2675 SmallVector<uint64_t, 16> Offsets; 2676 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 2677 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 2678 2679 auto VectorInfo = VectorizePTXValueVTs( 2680 VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1); 2681 2682 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2683 // 32-bits are sign extended or zero extended, depending on whether 2684 // they are signed or unsigned types. 2685 bool ExtendIntegerRetVal = 2686 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2687 2688 SmallVector<SDValue, 6> StoreOperands; 2689 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2690 // New load/store. Record chain and offset operands. 2691 if (VectorInfo[i] & PVF_FIRST) { 2692 assert(StoreOperands.empty() && "Orphaned operand list."); 2693 StoreOperands.push_back(Chain); 2694 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 2695 } 2696 2697 SDValue RetVal = OutVals[i]; 2698 if (ExtendIntegerRetVal) { 2699 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 2700 : ISD::ZERO_EXTEND, 2701 dl, MVT::i32, RetVal); 2702 } else if (RetVal.getValueSizeInBits() < 16) { 2703 // Use 16-bit registers for small load-stores as it's the 2704 // smallest general purpose register size supported by NVPTX. 2705 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 2706 } 2707 2708 // Record the value to return. 2709 StoreOperands.push_back(RetVal); 2710 2711 // That's the last element of this store op. 2712 if (VectorInfo[i] & PVF_LAST) { 2713 NVPTXISD::NodeType Op; 2714 unsigned NumElts = StoreOperands.size() - 2; 2715 switch (NumElts) { 2716 case 1: 2717 Op = NVPTXISD::StoreRetval; 2718 break; 2719 case 2: 2720 Op = NVPTXISD::StoreRetvalV2; 2721 break; 2722 case 4: 2723 Op = NVPTXISD::StoreRetvalV4; 2724 break; 2725 default: 2726 llvm_unreachable("Invalid vector info."); 2727 } 2728 2729 // Adjust type of load/store op if we've extended the scalar 2730 // return value. 2731 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 2732 Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), 2733 StoreOperands, TheStoreType, 2734 MachinePointerInfo(), /* Align */ 1, 2735 MachineMemOperand::MOStore); 2736 // Cleanup vector state. 2737 StoreOperands.clear(); 2738 } 2739 } 2740 2741 return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); 2742 } 2743 2744 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 2745 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 2746 SelectionDAG &DAG) const { 2747 if (Constraint.length() > 1) 2748 return; 2749 else 2750 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 2751 } 2752 2753 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 2754 switch (Intrinsic) { 2755 default: 2756 return 0; 2757 2758 case Intrinsic::nvvm_tex_1d_v4f32_s32: 2759 return NVPTXISD::Tex1DFloatS32; 2760 case Intrinsic::nvvm_tex_1d_v4f32_f32: 2761 return NVPTXISD::Tex1DFloatFloat; 2762 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 2763 return NVPTXISD::Tex1DFloatFloatLevel; 2764 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 2765 return NVPTXISD::Tex1DFloatFloatGrad; 2766 case Intrinsic::nvvm_tex_1d_v4s32_s32: 2767 return NVPTXISD::Tex1DS32S32; 2768 case Intrinsic::nvvm_tex_1d_v4s32_f32: 2769 return NVPTXISD::Tex1DS32Float; 2770 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 2771 return NVPTXISD::Tex1DS32FloatLevel; 2772 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 2773 return NVPTXISD::Tex1DS32FloatGrad; 2774 case Intrinsic::nvvm_tex_1d_v4u32_s32: 2775 return NVPTXISD::Tex1DU32S32; 2776 case Intrinsic::nvvm_tex_1d_v4u32_f32: 2777 return NVPTXISD::Tex1DU32Float; 2778 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 2779 return NVPTXISD::Tex1DU32FloatLevel; 2780 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 2781 return NVPTXISD::Tex1DU32FloatGrad; 2782 2783 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 2784 return NVPTXISD::Tex1DArrayFloatS32; 2785 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 2786 return NVPTXISD::Tex1DArrayFloatFloat; 2787 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 2788 return NVPTXISD::Tex1DArrayFloatFloatLevel; 2789 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 2790 return NVPTXISD::Tex1DArrayFloatFloatGrad; 2791 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 2792 return NVPTXISD::Tex1DArrayS32S32; 2793 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 2794 return NVPTXISD::Tex1DArrayS32Float; 2795 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 2796 return NVPTXISD::Tex1DArrayS32FloatLevel; 2797 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 2798 return NVPTXISD::Tex1DArrayS32FloatGrad; 2799 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 2800 return NVPTXISD::Tex1DArrayU32S32; 2801 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 2802 return NVPTXISD::Tex1DArrayU32Float; 2803 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 2804 return NVPTXISD::Tex1DArrayU32FloatLevel; 2805 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 2806 return NVPTXISD::Tex1DArrayU32FloatGrad; 2807 2808 case Intrinsic::nvvm_tex_2d_v4f32_s32: 2809 return NVPTXISD::Tex2DFloatS32; 2810 case Intrinsic::nvvm_tex_2d_v4f32_f32: 2811 return NVPTXISD::Tex2DFloatFloat; 2812 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 2813 return NVPTXISD::Tex2DFloatFloatLevel; 2814 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 2815 return NVPTXISD::Tex2DFloatFloatGrad; 2816 case Intrinsic::nvvm_tex_2d_v4s32_s32: 2817 return NVPTXISD::Tex2DS32S32; 2818 case Intrinsic::nvvm_tex_2d_v4s32_f32: 2819 return NVPTXISD::Tex2DS32Float; 2820 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 2821 return NVPTXISD::Tex2DS32FloatLevel; 2822 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 2823 return NVPTXISD::Tex2DS32FloatGrad; 2824 case Intrinsic::nvvm_tex_2d_v4u32_s32: 2825 return NVPTXISD::Tex2DU32S32; 2826 case Intrinsic::nvvm_tex_2d_v4u32_f32: 2827 return NVPTXISD::Tex2DU32Float; 2828 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 2829 return NVPTXISD::Tex2DU32FloatLevel; 2830 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 2831 return NVPTXISD::Tex2DU32FloatGrad; 2832 2833 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 2834 return NVPTXISD::Tex2DArrayFloatS32; 2835 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 2836 return NVPTXISD::Tex2DArrayFloatFloat; 2837 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 2838 return NVPTXISD::Tex2DArrayFloatFloatLevel; 2839 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 2840 return NVPTXISD::Tex2DArrayFloatFloatGrad; 2841 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 2842 return NVPTXISD::Tex2DArrayS32S32; 2843 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 2844 return NVPTXISD::Tex2DArrayS32Float; 2845 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 2846 return NVPTXISD::Tex2DArrayS32FloatLevel; 2847 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 2848 return NVPTXISD::Tex2DArrayS32FloatGrad; 2849 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 2850 return NVPTXISD::Tex2DArrayU32S32; 2851 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 2852 return NVPTXISD::Tex2DArrayU32Float; 2853 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 2854 return NVPTXISD::Tex2DArrayU32FloatLevel; 2855 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 2856 return NVPTXISD::Tex2DArrayU32FloatGrad; 2857 2858 case Intrinsic::nvvm_tex_3d_v4f32_s32: 2859 return NVPTXISD::Tex3DFloatS32; 2860 case Intrinsic::nvvm_tex_3d_v4f32_f32: 2861 return NVPTXISD::Tex3DFloatFloat; 2862 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 2863 return NVPTXISD::Tex3DFloatFloatLevel; 2864 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 2865 return NVPTXISD::Tex3DFloatFloatGrad; 2866 case Intrinsic::nvvm_tex_3d_v4s32_s32: 2867 return NVPTXISD::Tex3DS32S32; 2868 case Intrinsic::nvvm_tex_3d_v4s32_f32: 2869 return NVPTXISD::Tex3DS32Float; 2870 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 2871 return NVPTXISD::Tex3DS32FloatLevel; 2872 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 2873 return NVPTXISD::Tex3DS32FloatGrad; 2874 case Intrinsic::nvvm_tex_3d_v4u32_s32: 2875 return NVPTXISD::Tex3DU32S32; 2876 case Intrinsic::nvvm_tex_3d_v4u32_f32: 2877 return NVPTXISD::Tex3DU32Float; 2878 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 2879 return NVPTXISD::Tex3DU32FloatLevel; 2880 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 2881 return NVPTXISD::Tex3DU32FloatGrad; 2882 2883 case Intrinsic::nvvm_tex_cube_v4f32_f32: 2884 return NVPTXISD::TexCubeFloatFloat; 2885 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 2886 return NVPTXISD::TexCubeFloatFloatLevel; 2887 case Intrinsic::nvvm_tex_cube_v4s32_f32: 2888 return NVPTXISD::TexCubeS32Float; 2889 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 2890 return NVPTXISD::TexCubeS32FloatLevel; 2891 case Intrinsic::nvvm_tex_cube_v4u32_f32: 2892 return NVPTXISD::TexCubeU32Float; 2893 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 2894 return NVPTXISD::TexCubeU32FloatLevel; 2895 2896 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 2897 return NVPTXISD::TexCubeArrayFloatFloat; 2898 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 2899 return NVPTXISD::TexCubeArrayFloatFloatLevel; 2900 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 2901 return NVPTXISD::TexCubeArrayS32Float; 2902 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 2903 return NVPTXISD::TexCubeArrayS32FloatLevel; 2904 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 2905 return NVPTXISD::TexCubeArrayU32Float; 2906 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 2907 return NVPTXISD::TexCubeArrayU32FloatLevel; 2908 2909 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 2910 return NVPTXISD::Tld4R2DFloatFloat; 2911 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 2912 return NVPTXISD::Tld4G2DFloatFloat; 2913 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 2914 return NVPTXISD::Tld4B2DFloatFloat; 2915 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 2916 return NVPTXISD::Tld4A2DFloatFloat; 2917 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 2918 return NVPTXISD::Tld4R2DS64Float; 2919 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 2920 return NVPTXISD::Tld4G2DS64Float; 2921 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 2922 return NVPTXISD::Tld4B2DS64Float; 2923 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 2924 return NVPTXISD::Tld4A2DS64Float; 2925 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 2926 return NVPTXISD::Tld4R2DU64Float; 2927 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 2928 return NVPTXISD::Tld4G2DU64Float; 2929 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 2930 return NVPTXISD::Tld4B2DU64Float; 2931 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 2932 return NVPTXISD::Tld4A2DU64Float; 2933 2934 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 2935 return NVPTXISD::TexUnified1DFloatS32; 2936 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 2937 return NVPTXISD::TexUnified1DFloatFloat; 2938 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 2939 return NVPTXISD::TexUnified1DFloatFloatLevel; 2940 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 2941 return NVPTXISD::TexUnified1DFloatFloatGrad; 2942 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 2943 return NVPTXISD::TexUnified1DS32S32; 2944 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 2945 return NVPTXISD::TexUnified1DS32Float; 2946 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 2947 return NVPTXISD::TexUnified1DS32FloatLevel; 2948 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 2949 return NVPTXISD::TexUnified1DS32FloatGrad; 2950 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 2951 return NVPTXISD::TexUnified1DU32S32; 2952 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 2953 return NVPTXISD::TexUnified1DU32Float; 2954 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 2955 return NVPTXISD::TexUnified1DU32FloatLevel; 2956 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 2957 return NVPTXISD::TexUnified1DU32FloatGrad; 2958 2959 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 2960 return NVPTXISD::TexUnified1DArrayFloatS32; 2961 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 2962 return NVPTXISD::TexUnified1DArrayFloatFloat; 2963 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 2964 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 2965 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 2966 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 2967 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 2968 return NVPTXISD::TexUnified1DArrayS32S32; 2969 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 2970 return NVPTXISD::TexUnified1DArrayS32Float; 2971 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 2972 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 2973 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 2974 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 2975 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 2976 return NVPTXISD::TexUnified1DArrayU32S32; 2977 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 2978 return NVPTXISD::TexUnified1DArrayU32Float; 2979 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 2980 return NVPTXISD::TexUnified1DArrayU32FloatLevel; 2981 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 2982 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 2983 2984 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 2985 return NVPTXISD::TexUnified2DFloatS32; 2986 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 2987 return NVPTXISD::TexUnified2DFloatFloat; 2988 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 2989 return NVPTXISD::TexUnified2DFloatFloatLevel; 2990 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 2991 return NVPTXISD::TexUnified2DFloatFloatGrad; 2992 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 2993 return NVPTXISD::TexUnified2DS32S32; 2994 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 2995 return NVPTXISD::TexUnified2DS32Float; 2996 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 2997 return NVPTXISD::TexUnified2DS32FloatLevel; 2998 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 2999 return NVPTXISD::TexUnified2DS32FloatGrad; 3000 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3001 return NVPTXISD::TexUnified2DU32S32; 3002 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3003 return NVPTXISD::TexUnified2DU32Float; 3004 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3005 return NVPTXISD::TexUnified2DU32FloatLevel; 3006 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3007 return NVPTXISD::TexUnified2DU32FloatGrad; 3008 3009 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3010 return NVPTXISD::TexUnified2DArrayFloatS32; 3011 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3012 return NVPTXISD::TexUnified2DArrayFloatFloat; 3013 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3014 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3015 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3016 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3017 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3018 return NVPTXISD::TexUnified2DArrayS32S32; 3019 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3020 return NVPTXISD::TexUnified2DArrayS32Float; 3021 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3022 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3023 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3024 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3025 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3026 return NVPTXISD::TexUnified2DArrayU32S32; 3027 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3028 return NVPTXISD::TexUnified2DArrayU32Float; 3029 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3030 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3031 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3032 return NVPTXISD::TexUnified2DArrayU32FloatGrad; 3033 3034 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3035 return NVPTXISD::TexUnified3DFloatS32; 3036 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3037 return NVPTXISD::TexUnified3DFloatFloat; 3038 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3039 return NVPTXISD::TexUnified3DFloatFloatLevel; 3040 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3041 return NVPTXISD::TexUnified3DFloatFloatGrad; 3042 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3043 return NVPTXISD::TexUnified3DS32S32; 3044 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3045 return NVPTXISD::TexUnified3DS32Float; 3046 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3047 return NVPTXISD::TexUnified3DS32FloatLevel; 3048 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3049 return NVPTXISD::TexUnified3DS32FloatGrad; 3050 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3051 return NVPTXISD::TexUnified3DU32S32; 3052 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3053 return NVPTXISD::TexUnified3DU32Float; 3054 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3055 return NVPTXISD::TexUnified3DU32FloatLevel; 3056 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3057 return NVPTXISD::TexUnified3DU32FloatGrad; 3058 3059 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3060 return NVPTXISD::TexUnifiedCubeFloatFloat; 3061 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3062 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3063 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3064 return NVPTXISD::TexUnifiedCubeS32Float; 3065 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3066 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3067 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3068 return NVPTXISD::TexUnifiedCubeU32Float; 3069 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3070 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3071 3072 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3073 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3074 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3075 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3076 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3077 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3078 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3079 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3080 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3081 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3082 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3083 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3084 3085 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3086 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3087 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3088 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3089 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3090 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3091 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3092 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3093 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3094 return NVPTXISD::Tld4UnifiedR2DS64Float; 3095 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3096 return NVPTXISD::Tld4UnifiedG2DS64Float; 3097 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3098 return NVPTXISD::Tld4UnifiedB2DS64Float; 3099 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3100 return NVPTXISD::Tld4UnifiedA2DS64Float; 3101 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3102 return NVPTXISD::Tld4UnifiedR2DU64Float; 3103 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3104 return NVPTXISD::Tld4UnifiedG2DU64Float; 3105 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3106 return NVPTXISD::Tld4UnifiedB2DU64Float; 3107 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3108 return NVPTXISD::Tld4UnifiedA2DU64Float; 3109 } 3110 } 3111 3112 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3113 switch (Intrinsic) { 3114 default: 3115 return 0; 3116 case Intrinsic::nvvm_suld_1d_i8_clamp: 3117 return NVPTXISD::Suld1DI8Clamp; 3118 case Intrinsic::nvvm_suld_1d_i16_clamp: 3119 return NVPTXISD::Suld1DI16Clamp; 3120 case Intrinsic::nvvm_suld_1d_i32_clamp: 3121 return NVPTXISD::Suld1DI32Clamp; 3122 case Intrinsic::nvvm_suld_1d_i64_clamp: 3123 return NVPTXISD::Suld1DI64Clamp; 3124 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3125 return NVPTXISD::Suld1DV2I8Clamp; 3126 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3127 return NVPTXISD::Suld1DV2I16Clamp; 3128 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3129 return NVPTXISD::Suld1DV2I32Clamp; 3130 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3131 return NVPTXISD::Suld1DV2I64Clamp; 3132 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3133 return NVPTXISD::Suld1DV4I8Clamp; 3134 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3135 return NVPTXISD::Suld1DV4I16Clamp; 3136 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3137 return NVPTXISD::Suld1DV4I32Clamp; 3138 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3139 return NVPTXISD::Suld1DArrayI8Clamp; 3140 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3141 return NVPTXISD::Suld1DArrayI16Clamp; 3142 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3143 return NVPTXISD::Suld1DArrayI32Clamp; 3144 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3145 return NVPTXISD::Suld1DArrayI64Clamp; 3146 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3147 return NVPTXISD::Suld1DArrayV2I8Clamp; 3148 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3149 return NVPTXISD::Suld1DArrayV2I16Clamp; 3150 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3151 return NVPTXISD::Suld1DArrayV2I32Clamp; 3152 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3153 return NVPTXISD::Suld1DArrayV2I64Clamp; 3154 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3155 return NVPTXISD::Suld1DArrayV4I8Clamp; 3156 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3157 return NVPTXISD::Suld1DArrayV4I16Clamp; 3158 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3159 return NVPTXISD::Suld1DArrayV4I32Clamp; 3160 case Intrinsic::nvvm_suld_2d_i8_clamp: 3161 return NVPTXISD::Suld2DI8Clamp; 3162 case Intrinsic::nvvm_suld_2d_i16_clamp: 3163 return NVPTXISD::Suld2DI16Clamp; 3164 case Intrinsic::nvvm_suld_2d_i32_clamp: 3165 return NVPTXISD::Suld2DI32Clamp; 3166 case Intrinsic::nvvm_suld_2d_i64_clamp: 3167 return NVPTXISD::Suld2DI64Clamp; 3168 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3169 return NVPTXISD::Suld2DV2I8Clamp; 3170 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3171 return NVPTXISD::Suld2DV2I16Clamp; 3172 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3173 return NVPTXISD::Suld2DV2I32Clamp; 3174 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3175 return NVPTXISD::Suld2DV2I64Clamp; 3176 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3177 return NVPTXISD::Suld2DV4I8Clamp; 3178 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3179 return NVPTXISD::Suld2DV4I16Clamp; 3180 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3181 return NVPTXISD::Suld2DV4I32Clamp; 3182 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3183 return NVPTXISD::Suld2DArrayI8Clamp; 3184 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3185 return NVPTXISD::Suld2DArrayI16Clamp; 3186 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 3187 return NVPTXISD::Suld2DArrayI32Clamp; 3188 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3189 return NVPTXISD::Suld2DArrayI64Clamp; 3190 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3191 return NVPTXISD::Suld2DArrayV2I8Clamp; 3192 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3193 return NVPTXISD::Suld2DArrayV2I16Clamp; 3194 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3195 return NVPTXISD::Suld2DArrayV2I32Clamp; 3196 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3197 return NVPTXISD::Suld2DArrayV2I64Clamp; 3198 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3199 return NVPTXISD::Suld2DArrayV4I8Clamp; 3200 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3201 return NVPTXISD::Suld2DArrayV4I16Clamp; 3202 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3203 return NVPTXISD::Suld2DArrayV4I32Clamp; 3204 case Intrinsic::nvvm_suld_3d_i8_clamp: 3205 return NVPTXISD::Suld3DI8Clamp; 3206 case Intrinsic::nvvm_suld_3d_i16_clamp: 3207 return NVPTXISD::Suld3DI16Clamp; 3208 case Intrinsic::nvvm_suld_3d_i32_clamp: 3209 return NVPTXISD::Suld3DI32Clamp; 3210 case Intrinsic::nvvm_suld_3d_i64_clamp: 3211 return NVPTXISD::Suld3DI64Clamp; 3212 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3213 return NVPTXISD::Suld3DV2I8Clamp; 3214 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3215 return NVPTXISD::Suld3DV2I16Clamp; 3216 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3217 return NVPTXISD::Suld3DV2I32Clamp; 3218 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3219 return NVPTXISD::Suld3DV2I64Clamp; 3220 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 3221 return NVPTXISD::Suld3DV4I8Clamp; 3222 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 3223 return NVPTXISD::Suld3DV4I16Clamp; 3224 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 3225 return NVPTXISD::Suld3DV4I32Clamp; 3226 case Intrinsic::nvvm_suld_1d_i8_trap: 3227 return NVPTXISD::Suld1DI8Trap; 3228 case Intrinsic::nvvm_suld_1d_i16_trap: 3229 return NVPTXISD::Suld1DI16Trap; 3230 case Intrinsic::nvvm_suld_1d_i32_trap: 3231 return NVPTXISD::Suld1DI32Trap; 3232 case Intrinsic::nvvm_suld_1d_i64_trap: 3233 return NVPTXISD::Suld1DI64Trap; 3234 case Intrinsic::nvvm_suld_1d_v2i8_trap: 3235 return NVPTXISD::Suld1DV2I8Trap; 3236 case Intrinsic::nvvm_suld_1d_v2i16_trap: 3237 return NVPTXISD::Suld1DV2I16Trap; 3238 case Intrinsic::nvvm_suld_1d_v2i32_trap: 3239 return NVPTXISD::Suld1DV2I32Trap; 3240 case Intrinsic::nvvm_suld_1d_v2i64_trap: 3241 return NVPTXISD::Suld1DV2I64Trap; 3242 case Intrinsic::nvvm_suld_1d_v4i8_trap: 3243 return NVPTXISD::Suld1DV4I8Trap; 3244 case Intrinsic::nvvm_suld_1d_v4i16_trap: 3245 return NVPTXISD::Suld1DV4I16Trap; 3246 case Intrinsic::nvvm_suld_1d_v4i32_trap: 3247 return NVPTXISD::Suld1DV4I32Trap; 3248 case Intrinsic::nvvm_suld_1d_array_i8_trap: 3249 return NVPTXISD::Suld1DArrayI8Trap; 3250 case Intrinsic::nvvm_suld_1d_array_i16_trap: 3251 return NVPTXISD::Suld1DArrayI16Trap; 3252 case Intrinsic::nvvm_suld_1d_array_i32_trap: 3253 return NVPTXISD::Suld1DArrayI32Trap; 3254 case Intrinsic::nvvm_suld_1d_array_i64_trap: 3255 return NVPTXISD::Suld1DArrayI64Trap; 3256 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 3257 return NVPTXISD::Suld1DArrayV2I8Trap; 3258 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 3259 return NVPTXISD::Suld1DArrayV2I16Trap; 3260 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 3261 return NVPTXISD::Suld1DArrayV2I32Trap; 3262 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 3263 return NVPTXISD::Suld1DArrayV2I64Trap; 3264 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 3265 return NVPTXISD::Suld1DArrayV4I8Trap; 3266 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 3267 return NVPTXISD::Suld1DArrayV4I16Trap; 3268 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 3269 return NVPTXISD::Suld1DArrayV4I32Trap; 3270 case Intrinsic::nvvm_suld_2d_i8_trap: 3271 return NVPTXISD::Suld2DI8Trap; 3272 case Intrinsic::nvvm_suld_2d_i16_trap: 3273 return NVPTXISD::Suld2DI16Trap; 3274 case Intrinsic::nvvm_suld_2d_i32_trap: 3275 return NVPTXISD::Suld2DI32Trap; 3276 case Intrinsic::nvvm_suld_2d_i64_trap: 3277 return NVPTXISD::Suld2DI64Trap; 3278 case Intrinsic::nvvm_suld_2d_v2i8_trap: 3279 return NVPTXISD::Suld2DV2I8Trap; 3280 case Intrinsic::nvvm_suld_2d_v2i16_trap: 3281 return NVPTXISD::Suld2DV2I16Trap; 3282 case Intrinsic::nvvm_suld_2d_v2i32_trap: 3283 return NVPTXISD::Suld2DV2I32Trap; 3284 case Intrinsic::nvvm_suld_2d_v2i64_trap: 3285 return NVPTXISD::Suld2DV2I64Trap; 3286 case Intrinsic::nvvm_suld_2d_v4i8_trap: 3287 return NVPTXISD::Suld2DV4I8Trap; 3288 case Intrinsic::nvvm_suld_2d_v4i16_trap: 3289 return NVPTXISD::Suld2DV4I16Trap; 3290 case Intrinsic::nvvm_suld_2d_v4i32_trap: 3291 return NVPTXISD::Suld2DV4I32Trap; 3292 case Intrinsic::nvvm_suld_2d_array_i8_trap: 3293 return NVPTXISD::Suld2DArrayI8Trap; 3294 case Intrinsic::nvvm_suld_2d_array_i16_trap: 3295 return NVPTXISD::Suld2DArrayI16Trap; 3296 case Intrinsic::nvvm_suld_2d_array_i32_trap: 3297 return NVPTXISD::Suld2DArrayI32Trap; 3298 case Intrinsic::nvvm_suld_2d_array_i64_trap: 3299 return NVPTXISD::Suld2DArrayI64Trap; 3300 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 3301 return NVPTXISD::Suld2DArrayV2I8Trap; 3302 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 3303 return NVPTXISD::Suld2DArrayV2I16Trap; 3304 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 3305 return NVPTXISD::Suld2DArrayV2I32Trap; 3306 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 3307 return NVPTXISD::Suld2DArrayV2I64Trap; 3308 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 3309 return NVPTXISD::Suld2DArrayV4I8Trap; 3310 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 3311 return NVPTXISD::Suld2DArrayV4I16Trap; 3312 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 3313 return NVPTXISD::Suld2DArrayV4I32Trap; 3314 case Intrinsic::nvvm_suld_3d_i8_trap: 3315 return NVPTXISD::Suld3DI8Trap; 3316 case Intrinsic::nvvm_suld_3d_i16_trap: 3317 return NVPTXISD::Suld3DI16Trap; 3318 case Intrinsic::nvvm_suld_3d_i32_trap: 3319 return NVPTXISD::Suld3DI32Trap; 3320 case Intrinsic::nvvm_suld_3d_i64_trap: 3321 return NVPTXISD::Suld3DI64Trap; 3322 case Intrinsic::nvvm_suld_3d_v2i8_trap: 3323 return NVPTXISD::Suld3DV2I8Trap; 3324 case Intrinsic::nvvm_suld_3d_v2i16_trap: 3325 return NVPTXISD::Suld3DV2I16Trap; 3326 case Intrinsic::nvvm_suld_3d_v2i32_trap: 3327 return NVPTXISD::Suld3DV2I32Trap; 3328 case Intrinsic::nvvm_suld_3d_v2i64_trap: 3329 return NVPTXISD::Suld3DV2I64Trap; 3330 case Intrinsic::nvvm_suld_3d_v4i8_trap: 3331 return NVPTXISD::Suld3DV4I8Trap; 3332 case Intrinsic::nvvm_suld_3d_v4i16_trap: 3333 return NVPTXISD::Suld3DV4I16Trap; 3334 case Intrinsic::nvvm_suld_3d_v4i32_trap: 3335 return NVPTXISD::Suld3DV4I32Trap; 3336 case Intrinsic::nvvm_suld_1d_i8_zero: 3337 return NVPTXISD::Suld1DI8Zero; 3338 case Intrinsic::nvvm_suld_1d_i16_zero: 3339 return NVPTXISD::Suld1DI16Zero; 3340 case Intrinsic::nvvm_suld_1d_i32_zero: 3341 return NVPTXISD::Suld1DI32Zero; 3342 case Intrinsic::nvvm_suld_1d_i64_zero: 3343 return NVPTXISD::Suld1DI64Zero; 3344 case Intrinsic::nvvm_suld_1d_v2i8_zero: 3345 return NVPTXISD::Suld1DV2I8Zero; 3346 case Intrinsic::nvvm_suld_1d_v2i16_zero: 3347 return NVPTXISD::Suld1DV2I16Zero; 3348 case Intrinsic::nvvm_suld_1d_v2i32_zero: 3349 return NVPTXISD::Suld1DV2I32Zero; 3350 case Intrinsic::nvvm_suld_1d_v2i64_zero: 3351 return NVPTXISD::Suld1DV2I64Zero; 3352 case Intrinsic::nvvm_suld_1d_v4i8_zero: 3353 return NVPTXISD::Suld1DV4I8Zero; 3354 case Intrinsic::nvvm_suld_1d_v4i16_zero: 3355 return NVPTXISD::Suld1DV4I16Zero; 3356 case Intrinsic::nvvm_suld_1d_v4i32_zero: 3357 return NVPTXISD::Suld1DV4I32Zero; 3358 case Intrinsic::nvvm_suld_1d_array_i8_zero: 3359 return NVPTXISD::Suld1DArrayI8Zero; 3360 case Intrinsic::nvvm_suld_1d_array_i16_zero: 3361 return NVPTXISD::Suld1DArrayI16Zero; 3362 case Intrinsic::nvvm_suld_1d_array_i32_zero: 3363 return NVPTXISD::Suld1DArrayI32Zero; 3364 case Intrinsic::nvvm_suld_1d_array_i64_zero: 3365 return NVPTXISD::Suld1DArrayI64Zero; 3366 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 3367 return NVPTXISD::Suld1DArrayV2I8Zero; 3368 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 3369 return NVPTXISD::Suld1DArrayV2I16Zero; 3370 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 3371 return NVPTXISD::Suld1DArrayV2I32Zero; 3372 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 3373 return NVPTXISD::Suld1DArrayV2I64Zero; 3374 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 3375 return NVPTXISD::Suld1DArrayV4I8Zero; 3376 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 3377 return NVPTXISD::Suld1DArrayV4I16Zero; 3378 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 3379 return NVPTXISD::Suld1DArrayV4I32Zero; 3380 case Intrinsic::nvvm_suld_2d_i8_zero: 3381 return NVPTXISD::Suld2DI8Zero; 3382 case Intrinsic::nvvm_suld_2d_i16_zero: 3383 return NVPTXISD::Suld2DI16Zero; 3384 case Intrinsic::nvvm_suld_2d_i32_zero: 3385 return NVPTXISD::Suld2DI32Zero; 3386 case Intrinsic::nvvm_suld_2d_i64_zero: 3387 return NVPTXISD::Suld2DI64Zero; 3388 case Intrinsic::nvvm_suld_2d_v2i8_zero: 3389 return NVPTXISD::Suld2DV2I8Zero; 3390 case Intrinsic::nvvm_suld_2d_v2i16_zero: 3391 return NVPTXISD::Suld2DV2I16Zero; 3392 case Intrinsic::nvvm_suld_2d_v2i32_zero: 3393 return NVPTXISD::Suld2DV2I32Zero; 3394 case Intrinsic::nvvm_suld_2d_v2i64_zero: 3395 return NVPTXISD::Suld2DV2I64Zero; 3396 case Intrinsic::nvvm_suld_2d_v4i8_zero: 3397 return NVPTXISD::Suld2DV4I8Zero; 3398 case Intrinsic::nvvm_suld_2d_v4i16_zero: 3399 return NVPTXISD::Suld2DV4I16Zero; 3400 case Intrinsic::nvvm_suld_2d_v4i32_zero: 3401 return NVPTXISD::Suld2DV4I32Zero; 3402 case Intrinsic::nvvm_suld_2d_array_i8_zero: 3403 return NVPTXISD::Suld2DArrayI8Zero; 3404 case Intrinsic::nvvm_suld_2d_array_i16_zero: 3405 return NVPTXISD::Suld2DArrayI16Zero; 3406 case Intrinsic::nvvm_suld_2d_array_i32_zero: 3407 return NVPTXISD::Suld2DArrayI32Zero; 3408 case Intrinsic::nvvm_suld_2d_array_i64_zero: 3409 return NVPTXISD::Suld2DArrayI64Zero; 3410 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 3411 return NVPTXISD::Suld2DArrayV2I8Zero; 3412 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 3413 return NVPTXISD::Suld2DArrayV2I16Zero; 3414 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 3415 return NVPTXISD::Suld2DArrayV2I32Zero; 3416 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 3417 return NVPTXISD::Suld2DArrayV2I64Zero; 3418 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 3419 return NVPTXISD::Suld2DArrayV4I8Zero; 3420 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 3421 return NVPTXISD::Suld2DArrayV4I16Zero; 3422 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 3423 return NVPTXISD::Suld2DArrayV4I32Zero; 3424 case Intrinsic::nvvm_suld_3d_i8_zero: 3425 return NVPTXISD::Suld3DI8Zero; 3426 case Intrinsic::nvvm_suld_3d_i16_zero: 3427 return NVPTXISD::Suld3DI16Zero; 3428 case Intrinsic::nvvm_suld_3d_i32_zero: 3429 return NVPTXISD::Suld3DI32Zero; 3430 case Intrinsic::nvvm_suld_3d_i64_zero: 3431 return NVPTXISD::Suld3DI64Zero; 3432 case Intrinsic::nvvm_suld_3d_v2i8_zero: 3433 return NVPTXISD::Suld3DV2I8Zero; 3434 case Intrinsic::nvvm_suld_3d_v2i16_zero: 3435 return NVPTXISD::Suld3DV2I16Zero; 3436 case Intrinsic::nvvm_suld_3d_v2i32_zero: 3437 return NVPTXISD::Suld3DV2I32Zero; 3438 case Intrinsic::nvvm_suld_3d_v2i64_zero: 3439 return NVPTXISD::Suld3DV2I64Zero; 3440 case Intrinsic::nvvm_suld_3d_v4i8_zero: 3441 return NVPTXISD::Suld3DV4I8Zero; 3442 case Intrinsic::nvvm_suld_3d_v4i16_zero: 3443 return NVPTXISD::Suld3DV4I16Zero; 3444 case Intrinsic::nvvm_suld_3d_v4i32_zero: 3445 return NVPTXISD::Suld3DV4I32Zero; 3446 } 3447 } 3448 3449 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 3450 // TgtMemIntrinsic 3451 // because we need the information that is only available in the "Value" type 3452 // of destination 3453 // pointer. In particular, the address space information. 3454 bool NVPTXTargetLowering::getTgtMemIntrinsic( 3455 IntrinsicInfo &Info, const CallInst &I, 3456 MachineFunction &MF, unsigned Intrinsic) const { 3457 switch (Intrinsic) { 3458 default: 3459 return false; 3460 case Intrinsic::nvvm_match_all_sync_i32p: 3461 case Intrinsic::nvvm_match_all_sync_i64p: 3462 Info.opc = ISD::INTRINSIC_W_CHAIN; 3463 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 3464 // in order to model data exchange with other threads, but perform no real 3465 // memory accesses. 3466 Info.memVT = MVT::i1; 3467 3468 // Our result depends on both our and other thread's arguments. 3469 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 3470 return true; 3471 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: 3472 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: 3473 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: 3474 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: 3475 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: 3476 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: 3477 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: 3478 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: 3479 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: 3480 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: 3481 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: 3482 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: 3483 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: 3484 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: 3485 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: 3486 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: 3487 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: 3488 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: 3489 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: 3490 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: 3491 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: 3492 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: 3493 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: 3494 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { 3495 Info.opc = ISD::INTRINSIC_W_CHAIN; 3496 Info.memVT = MVT::v8f16; 3497 Info.ptrVal = I.getArgOperand(0); 3498 Info.offset = 0; 3499 Info.flags = MachineMemOperand::MOLoad; 3500 Info.align = Align(16); 3501 return true; 3502 } 3503 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: 3504 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: 3505 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: 3506 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: 3507 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: 3508 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: 3509 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: 3510 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: 3511 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: 3512 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: 3513 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: 3514 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 3515 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 3516 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 3517 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 3518 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: { 3519 Info.opc = ISD::INTRINSIC_W_CHAIN; 3520 Info.memVT = MVT::v2i32; 3521 Info.ptrVal = I.getArgOperand(0); 3522 Info.offset = 0; 3523 Info.flags = MachineMemOperand::MOLoad; 3524 Info.align = Align(8); 3525 return true; 3526 } 3527 3528 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 3529 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 3530 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 3531 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 3532 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 3533 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 3534 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 3535 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 3536 3537 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 3538 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 3539 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 3540 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 3541 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 3542 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 3543 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 3544 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: { 3545 Info.opc = ISD::INTRINSIC_W_CHAIN; 3546 Info.memVT = MVT::v4i32; 3547 Info.ptrVal = I.getArgOperand(0); 3548 Info.offset = 0; 3549 Info.flags = MachineMemOperand::MOLoad; 3550 Info.align = Align(16); 3551 return true; 3552 } 3553 3554 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 3555 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 3556 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 3557 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 3558 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 3559 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 3560 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 3561 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 3562 3563 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 3564 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 3565 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 3566 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 3567 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 3568 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 3569 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 3570 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 3571 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 3572 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 3573 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 3574 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 3575 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 3576 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 3577 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 3578 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 3579 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 3580 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 3581 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 3582 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: { 3583 Info.opc = ISD::INTRINSIC_W_CHAIN; 3584 Info.memVT = MVT::i32; 3585 Info.ptrVal = I.getArgOperand(0); 3586 Info.offset = 0; 3587 Info.flags = MachineMemOperand::MOLoad; 3588 Info.align = Align(4); 3589 return true; 3590 } 3591 3592 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 3593 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 3594 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 3595 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 3596 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 3597 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 3598 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 3599 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 3600 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 3601 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 3602 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 3603 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 3604 Info.opc = ISD::INTRINSIC_W_CHAIN; 3605 Info.memVT = MVT::v4f16; 3606 Info.ptrVal = I.getArgOperand(0); 3607 Info.offset = 0; 3608 Info.flags = MachineMemOperand::MOLoad; 3609 Info.align = Align(16); 3610 return true; 3611 } 3612 3613 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 3614 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 3615 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 3616 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 3617 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 3618 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 3619 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 3620 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 3621 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 3622 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 3623 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 3624 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: { 3625 Info.opc = ISD::INTRINSIC_W_CHAIN; 3626 Info.memVT = MVT::v8f32; 3627 Info.ptrVal = I.getArgOperand(0); 3628 Info.offset = 0; 3629 Info.flags = MachineMemOperand::MOLoad; 3630 Info.align = Align(16); 3631 return true; 3632 } 3633 3634 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 3635 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 3636 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 3637 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 3638 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 3639 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 3640 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 3641 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 3642 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 3643 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 3644 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 3645 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 3646 Info.opc = ISD::INTRINSIC_W_CHAIN; 3647 Info.memVT = MVT::v8i32; 3648 Info.ptrVal = I.getArgOperand(0); 3649 Info.offset = 0; 3650 Info.flags = MachineMemOperand::MOLoad; 3651 Info.align = Align(16); 3652 return true; 3653 } 3654 3655 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 3656 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 3657 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 3658 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 3659 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 3660 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 3661 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 3662 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: { 3663 Info.opc = ISD::INTRINSIC_W_CHAIN; 3664 Info.memVT = MVT::v2i32; 3665 Info.ptrVal = I.getArgOperand(0); 3666 Info.offset = 0; 3667 Info.flags = MachineMemOperand::MOLoad; 3668 Info.align = Align(8); 3669 return true; 3670 } 3671 3672 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 3673 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 3674 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 3675 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 3676 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 3677 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 3678 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 3679 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 3680 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 3681 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 3682 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 3683 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 3684 Info.opc = ISD::INTRINSIC_VOID; 3685 Info.memVT = MVT::v4f16; 3686 Info.ptrVal = I.getArgOperand(0); 3687 Info.offset = 0; 3688 Info.flags = MachineMemOperand::MOStore; 3689 Info.align = Align(16); 3690 return true; 3691 } 3692 3693 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 3694 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 3695 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 3696 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 3697 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 3698 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 3699 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 3700 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 3701 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 3702 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 3703 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 3704 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: { 3705 Info.opc = ISD::INTRINSIC_VOID; 3706 Info.memVT = MVT::v8f32; 3707 Info.ptrVal = I.getArgOperand(0); 3708 Info.offset = 0; 3709 Info.flags = MachineMemOperand::MOStore; 3710 Info.align = Align(16); 3711 return true; 3712 } 3713 3714 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 3715 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 3716 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 3717 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 3718 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 3719 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 3720 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 3721 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 3722 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 3723 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 3724 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 3725 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 3726 Info.opc = ISD::INTRINSIC_VOID; 3727 Info.memVT = MVT::v8i32; 3728 Info.ptrVal = I.getArgOperand(0); 3729 Info.offset = 0; 3730 Info.flags = MachineMemOperand::MOStore; 3731 Info.align = Align(16); 3732 return true; 3733 } 3734 3735 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 3736 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 3737 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 3738 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 3739 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 3740 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 3741 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 3742 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 3743 Info.opc = ISD::INTRINSIC_VOID; 3744 Info.memVT = MVT::v2i32; 3745 Info.ptrVal = I.getArgOperand(0); 3746 Info.offset = 0; 3747 Info.flags = MachineMemOperand::MOStore; 3748 Info.align = Align(8); 3749 return true; 3750 } 3751 3752 case Intrinsic::nvvm_atomic_load_inc_32: 3753 case Intrinsic::nvvm_atomic_load_dec_32: 3754 3755 case Intrinsic::nvvm_atomic_add_gen_f_cta: 3756 case Intrinsic::nvvm_atomic_add_gen_f_sys: 3757 case Intrinsic::nvvm_atomic_add_gen_i_cta: 3758 case Intrinsic::nvvm_atomic_add_gen_i_sys: 3759 case Intrinsic::nvvm_atomic_and_gen_i_cta: 3760 case Intrinsic::nvvm_atomic_and_gen_i_sys: 3761 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 3762 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 3763 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 3764 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 3765 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 3766 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 3767 case Intrinsic::nvvm_atomic_max_gen_i_cta: 3768 case Intrinsic::nvvm_atomic_max_gen_i_sys: 3769 case Intrinsic::nvvm_atomic_min_gen_i_cta: 3770 case Intrinsic::nvvm_atomic_min_gen_i_sys: 3771 case Intrinsic::nvvm_atomic_or_gen_i_cta: 3772 case Intrinsic::nvvm_atomic_or_gen_i_sys: 3773 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 3774 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 3775 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 3776 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 3777 auto &DL = I.getModule()->getDataLayout(); 3778 Info.opc = ISD::INTRINSIC_W_CHAIN; 3779 Info.memVT = getValueType(DL, I.getType()); 3780 Info.ptrVal = I.getArgOperand(0); 3781 Info.offset = 0; 3782 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 3783 Info.align.reset(); 3784 return true; 3785 } 3786 3787 case Intrinsic::nvvm_ldu_global_i: 3788 case Intrinsic::nvvm_ldu_global_f: 3789 case Intrinsic::nvvm_ldu_global_p: { 3790 auto &DL = I.getModule()->getDataLayout(); 3791 Info.opc = ISD::INTRINSIC_W_CHAIN; 3792 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 3793 Info.memVT = getValueType(DL, I.getType()); 3794 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 3795 Info.memVT = getPointerTy(DL); 3796 else 3797 Info.memVT = getValueType(DL, I.getType()); 3798 Info.ptrVal = I.getArgOperand(0); 3799 Info.offset = 0; 3800 Info.flags = MachineMemOperand::MOLoad; 3801 Info.align = 3802 MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); 3803 3804 return true; 3805 } 3806 case Intrinsic::nvvm_ldg_global_i: 3807 case Intrinsic::nvvm_ldg_global_f: 3808 case Intrinsic::nvvm_ldg_global_p: { 3809 auto &DL = I.getModule()->getDataLayout(); 3810 3811 Info.opc = ISD::INTRINSIC_W_CHAIN; 3812 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 3813 Info.memVT = getValueType(DL, I.getType()); 3814 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 3815 Info.memVT = getPointerTy(DL); 3816 else 3817 Info.memVT = getValueType(DL, I.getType()); 3818 Info.ptrVal = I.getArgOperand(0); 3819 Info.offset = 0; 3820 Info.flags = MachineMemOperand::MOLoad; 3821 Info.align = 3822 MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); 3823 3824 return true; 3825 } 3826 3827 case Intrinsic::nvvm_tex_1d_v4f32_s32: 3828 case Intrinsic::nvvm_tex_1d_v4f32_f32: 3829 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 3830 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3831 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3832 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3833 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3834 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3835 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3836 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3837 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3838 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3839 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3840 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3841 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3842 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3843 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3844 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3845 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3846 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3847 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3848 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3849 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3850 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3851 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3852 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3853 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3854 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3855 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3856 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3857 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3858 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3859 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3860 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3861 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3862 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3863 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3864 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3865 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3866 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3867 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3868 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3869 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3870 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3871 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3872 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3873 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3874 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3875 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3876 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3877 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3878 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3879 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3880 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3881 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3882 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3883 Info.opc = getOpcForTextureInstr(Intrinsic); 3884 Info.memVT = MVT::v4f32; 3885 Info.ptrVal = nullptr; 3886 Info.offset = 0; 3887 Info.flags = MachineMemOperand::MOLoad; 3888 Info.align = Align(16); 3889 return true; 3890 3891 case Intrinsic::nvvm_tex_1d_v4s32_s32: 3892 case Intrinsic::nvvm_tex_1d_v4s32_f32: 3893 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 3894 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 3895 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 3896 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 3897 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 3898 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 3899 case Intrinsic::nvvm_tex_2d_v4s32_s32: 3900 case Intrinsic::nvvm_tex_2d_v4s32_f32: 3901 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 3902 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 3903 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3904 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3905 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3906 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3907 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3908 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3909 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3910 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3911 case Intrinsic::nvvm_tex_cube_v4s32_f32: 3912 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3913 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3914 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 3915 case Intrinsic::nvvm_tex_cube_v4u32_f32: 3916 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 3917 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 3918 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 3919 case Intrinsic::nvvm_tex_1d_v4u32_s32: 3920 case Intrinsic::nvvm_tex_1d_v4u32_f32: 3921 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 3922 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 3923 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 3924 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 3925 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 3926 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 3927 case Intrinsic::nvvm_tex_2d_v4u32_s32: 3928 case Intrinsic::nvvm_tex_2d_v4u32_f32: 3929 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 3930 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 3931 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 3932 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 3933 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 3934 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 3935 case Intrinsic::nvvm_tex_3d_v4u32_s32: 3936 case Intrinsic::nvvm_tex_3d_v4u32_f32: 3937 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 3938 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 3939 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 3940 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 3941 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 3942 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 3943 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 3944 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 3945 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 3946 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 3947 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 3948 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 3949 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 3950 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 3951 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 3952 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3953 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3954 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3955 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3956 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3957 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3958 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3959 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3960 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3961 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3962 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3963 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3964 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3965 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3966 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3967 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 3968 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 3969 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 3970 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 3971 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3972 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3973 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3974 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3975 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3976 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3977 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3978 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3979 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3980 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3981 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3982 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3983 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3984 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3985 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3986 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3987 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3988 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3989 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3990 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3991 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3992 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3993 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3994 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3995 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3996 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3997 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3998 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3999 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 4000 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 4001 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 4002 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 4003 Info.opc = getOpcForTextureInstr(Intrinsic); 4004 Info.memVT = MVT::v4i32; 4005 Info.ptrVal = nullptr; 4006 Info.offset = 0; 4007 Info.flags = MachineMemOperand::MOLoad; 4008 Info.align = Align(16); 4009 return true; 4010 4011 case Intrinsic::nvvm_suld_1d_i8_clamp: 4012 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 4013 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 4014 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 4015 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 4016 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 4017 case Intrinsic::nvvm_suld_2d_i8_clamp: 4018 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 4019 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4020 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4021 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4022 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4023 case Intrinsic::nvvm_suld_3d_i8_clamp: 4024 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4025 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4026 case Intrinsic::nvvm_suld_1d_i8_trap: 4027 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4028 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4029 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4030 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4031 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4032 case Intrinsic::nvvm_suld_2d_i8_trap: 4033 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4034 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4035 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4036 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4037 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4038 case Intrinsic::nvvm_suld_3d_i8_trap: 4039 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4040 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4041 case Intrinsic::nvvm_suld_1d_i8_zero: 4042 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4043 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4044 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4045 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4046 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4047 case Intrinsic::nvvm_suld_2d_i8_zero: 4048 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4049 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4050 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4051 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4052 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4053 case Intrinsic::nvvm_suld_3d_i8_zero: 4054 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4055 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4056 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4057 Info.memVT = MVT::i8; 4058 Info.ptrVal = nullptr; 4059 Info.offset = 0; 4060 Info.flags = MachineMemOperand::MOLoad; 4061 Info.align = Align(16); 4062 return true; 4063 4064 case Intrinsic::nvvm_suld_1d_i16_clamp: 4065 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4066 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4067 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4068 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4069 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4070 case Intrinsic::nvvm_suld_2d_i16_clamp: 4071 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4072 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4073 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4074 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4075 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4076 case Intrinsic::nvvm_suld_3d_i16_clamp: 4077 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4078 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4079 case Intrinsic::nvvm_suld_1d_i16_trap: 4080 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4081 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4082 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4083 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4084 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4085 case Intrinsic::nvvm_suld_2d_i16_trap: 4086 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4087 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4088 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4089 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4090 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4091 case Intrinsic::nvvm_suld_3d_i16_trap: 4092 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4093 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4094 case Intrinsic::nvvm_suld_1d_i16_zero: 4095 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4096 case Intrinsic::nvvm_suld_1d_v4i16_zero: 4097 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4098 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4099 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4100 case Intrinsic::nvvm_suld_2d_i16_zero: 4101 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4102 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4103 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4104 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4105 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4106 case Intrinsic::nvvm_suld_3d_i16_zero: 4107 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4108 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4109 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4110 Info.memVT = MVT::i16; 4111 Info.ptrVal = nullptr; 4112 Info.offset = 0; 4113 Info.flags = MachineMemOperand::MOLoad; 4114 Info.align = Align(16); 4115 return true; 4116 4117 case Intrinsic::nvvm_suld_1d_i32_clamp: 4118 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4119 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4120 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 4121 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 4122 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 4123 case Intrinsic::nvvm_suld_2d_i32_clamp: 4124 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 4125 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 4126 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 4127 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 4128 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 4129 case Intrinsic::nvvm_suld_3d_i32_clamp: 4130 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 4131 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 4132 case Intrinsic::nvvm_suld_1d_i32_trap: 4133 case Intrinsic::nvvm_suld_1d_v2i32_trap: 4134 case Intrinsic::nvvm_suld_1d_v4i32_trap: 4135 case Intrinsic::nvvm_suld_1d_array_i32_trap: 4136 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 4137 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 4138 case Intrinsic::nvvm_suld_2d_i32_trap: 4139 case Intrinsic::nvvm_suld_2d_v2i32_trap: 4140 case Intrinsic::nvvm_suld_2d_v4i32_trap: 4141 case Intrinsic::nvvm_suld_2d_array_i32_trap: 4142 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 4143 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 4144 case Intrinsic::nvvm_suld_3d_i32_trap: 4145 case Intrinsic::nvvm_suld_3d_v2i32_trap: 4146 case Intrinsic::nvvm_suld_3d_v4i32_trap: 4147 case Intrinsic::nvvm_suld_1d_i32_zero: 4148 case Intrinsic::nvvm_suld_1d_v2i32_zero: 4149 case Intrinsic::nvvm_suld_1d_v4i32_zero: 4150 case Intrinsic::nvvm_suld_1d_array_i32_zero: 4151 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 4152 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 4153 case Intrinsic::nvvm_suld_2d_i32_zero: 4154 case Intrinsic::nvvm_suld_2d_v2i32_zero: 4155 case Intrinsic::nvvm_suld_2d_v4i32_zero: 4156 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4157 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4158 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4159 case Intrinsic::nvvm_suld_3d_i32_zero: 4160 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4161 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4162 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4163 Info.memVT = MVT::i32; 4164 Info.ptrVal = nullptr; 4165 Info.offset = 0; 4166 Info.flags = MachineMemOperand::MOLoad; 4167 Info.align = Align(16); 4168 return true; 4169 4170 case Intrinsic::nvvm_suld_1d_i64_clamp: 4171 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 4172 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 4173 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 4174 case Intrinsic::nvvm_suld_2d_i64_clamp: 4175 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 4176 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 4177 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 4178 case Intrinsic::nvvm_suld_3d_i64_clamp: 4179 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 4180 case Intrinsic::nvvm_suld_1d_i64_trap: 4181 case Intrinsic::nvvm_suld_1d_v2i64_trap: 4182 case Intrinsic::nvvm_suld_1d_array_i64_trap: 4183 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 4184 case Intrinsic::nvvm_suld_2d_i64_trap: 4185 case Intrinsic::nvvm_suld_2d_v2i64_trap: 4186 case Intrinsic::nvvm_suld_2d_array_i64_trap: 4187 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 4188 case Intrinsic::nvvm_suld_3d_i64_trap: 4189 case Intrinsic::nvvm_suld_3d_v2i64_trap: 4190 case Intrinsic::nvvm_suld_1d_i64_zero: 4191 case Intrinsic::nvvm_suld_1d_v2i64_zero: 4192 case Intrinsic::nvvm_suld_1d_array_i64_zero: 4193 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 4194 case Intrinsic::nvvm_suld_2d_i64_zero: 4195 case Intrinsic::nvvm_suld_2d_v2i64_zero: 4196 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4197 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4198 case Intrinsic::nvvm_suld_3d_i64_zero: 4199 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4200 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4201 Info.memVT = MVT::i64; 4202 Info.ptrVal = nullptr; 4203 Info.offset = 0; 4204 Info.flags = MachineMemOperand::MOLoad; 4205 Info.align = Align(16); 4206 return true; 4207 } 4208 return false; 4209 } 4210 4211 /// isLegalAddressingMode - Return true if the addressing mode represented 4212 /// by AM is legal for this target, for a load/store of the specified type. 4213 /// Used to guide target specific optimizations, like loop strength reduction 4214 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 4215 /// (CodeGenPrepare.cpp) 4216 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, 4217 const AddrMode &AM, Type *Ty, 4218 unsigned AS, Instruction *I) const { 4219 // AddrMode - This represents an addressing mode of: 4220 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 4221 // 4222 // The legal address modes are 4223 // - [avar] 4224 // - [areg] 4225 // - [areg+immoff] 4226 // - [immAddr] 4227 4228 if (AM.BaseGV) { 4229 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; 4230 } 4231 4232 switch (AM.Scale) { 4233 case 0: // "r", "r+i" or "i" is allowed 4234 break; 4235 case 1: 4236 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 4237 return false; 4238 // Otherwise we have r+i. 4239 break; 4240 default: 4241 // No scale > 1 is allowed 4242 return false; 4243 } 4244 return true; 4245 } 4246 4247 //===----------------------------------------------------------------------===// 4248 // NVPTX Inline Assembly Support 4249 //===----------------------------------------------------------------------===// 4250 4251 /// getConstraintType - Given a constraint letter, return the type of 4252 /// constraint it is for this target. 4253 NVPTXTargetLowering::ConstraintType 4254 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { 4255 if (Constraint.size() == 1) { 4256 switch (Constraint[0]) { 4257 default: 4258 break; 4259 case 'b': 4260 case 'r': 4261 case 'h': 4262 case 'c': 4263 case 'l': 4264 case 'f': 4265 case 'd': 4266 case '0': 4267 case 'N': 4268 return C_RegisterClass; 4269 } 4270 } 4271 return TargetLowering::getConstraintType(Constraint); 4272 } 4273 4274 std::pair<unsigned, const TargetRegisterClass *> 4275 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 4276 StringRef Constraint, 4277 MVT VT) const { 4278 if (Constraint.size() == 1) { 4279 switch (Constraint[0]) { 4280 case 'b': 4281 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 4282 case 'c': 4283 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 4284 case 'h': 4285 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 4286 case 'r': 4287 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 4288 case 'l': 4289 case 'N': 4290 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 4291 case 'f': 4292 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 4293 case 'd': 4294 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 4295 } 4296 } 4297 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4298 } 4299 4300 //===----------------------------------------------------------------------===// 4301 // NVPTX DAG Combining 4302 //===----------------------------------------------------------------------===// 4303 4304 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, 4305 CodeGenOpt::Level OptLevel) const { 4306 // Always honor command-line argument 4307 if (FMAContractLevelOpt.getNumOccurrences() > 0) 4308 return FMAContractLevelOpt > 0; 4309 4310 // Do not contract if we're not optimizing the code. 4311 if (OptLevel == 0) 4312 return false; 4313 4314 // Honor TargetOptions flags that explicitly say fusion is okay. 4315 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) 4316 return true; 4317 4318 return allowUnsafeFPMath(MF); 4319 } 4320 4321 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { 4322 // Honor TargetOptions flags that explicitly say unsafe math is okay. 4323 if (MF.getTarget().Options.UnsafeFPMath) 4324 return true; 4325 4326 // Allow unsafe math if unsafe-fp-math attribute explicitly says so. 4327 const Function &F = MF.getFunction(); 4328 if (F.hasFnAttribute("unsafe-fp-math")) { 4329 Attribute Attr = F.getFnAttribute("unsafe-fp-math"); 4330 StringRef Val = Attr.getValueAsString(); 4331 if (Val == "true") 4332 return true; 4333 } 4334 4335 return false; 4336 } 4337 4338 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 4339 /// operands N0 and N1. This is a helper for PerformADDCombine that is 4340 /// called with the default operands, and if that fails, with commuted 4341 /// operands. 4342 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 4343 TargetLowering::DAGCombinerInfo &DCI, 4344 const NVPTXSubtarget &Subtarget, 4345 CodeGenOpt::Level OptLevel) { 4346 SelectionDAG &DAG = DCI.DAG; 4347 // Skip non-integer, non-scalar case 4348 EVT VT=N0.getValueType(); 4349 if (VT.isVector()) 4350 return SDValue(); 4351 4352 // fold (add (mul a, b), c) -> (mad a, b, c) 4353 // 4354 if (N0.getOpcode() == ISD::MUL) { 4355 assert (VT.isInteger()); 4356 // For integer: 4357 // Since integer multiply-add costs the same as integer multiply 4358 // but is more costly than integer add, do the fusion only when 4359 // the mul is only used in the add. 4360 if (OptLevel==CodeGenOpt::None || VT != MVT::i32 || 4361 !N0.getNode()->hasOneUse()) 4362 return SDValue(); 4363 4364 // Do the folding 4365 return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, 4366 N0.getOperand(0), N0.getOperand(1), N1); 4367 } 4368 else if (N0.getOpcode() == ISD::FMUL) { 4369 if (VT == MVT::f32 || VT == MVT::f64) { 4370 const auto *TLI = static_cast<const NVPTXTargetLowering *>( 4371 &DAG.getTargetLoweringInfo()); 4372 if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel)) 4373 return SDValue(); 4374 4375 // For floating point: 4376 // Do the fusion only when the mul has less than 5 uses and all 4377 // are add. 4378 // The heuristic is that if a use is not an add, then that use 4379 // cannot be fused into fma, therefore mul is still needed anyway. 4380 // If there are more than 4 uses, even if they are all add, fusing 4381 // them will increase register pressue. 4382 // 4383 int numUses = 0; 4384 int nonAddCount = 0; 4385 for (SDNode::use_iterator UI = N0.getNode()->use_begin(), 4386 UE = N0.getNode()->use_end(); 4387 UI != UE; ++UI) { 4388 numUses++; 4389 SDNode *User = *UI; 4390 if (User->getOpcode() != ISD::FADD) 4391 ++nonAddCount; 4392 } 4393 if (numUses >= 5) 4394 return SDValue(); 4395 if (nonAddCount) { 4396 int orderNo = N->getIROrder(); 4397 int orderNo2 = N0.getNode()->getIROrder(); 4398 // simple heuristics here for considering potential register 4399 // pressure, the logics here is that the differnce are used 4400 // to measure the distance between def and use, the longer distance 4401 // more likely cause register pressure. 4402 if (orderNo - orderNo2 < 500) 4403 return SDValue(); 4404 4405 // Now, check if at least one of the FMUL's operands is live beyond the node N, 4406 // which guarantees that the FMA will not increase register pressure at node N. 4407 bool opIsLive = false; 4408 const SDNode *left = N0.getOperand(0).getNode(); 4409 const SDNode *right = N0.getOperand(1).getNode(); 4410 4411 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right)) 4412 opIsLive = true; 4413 4414 if (!opIsLive) 4415 for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end(); UI != UE; ++UI) { 4416 SDNode *User = *UI; 4417 int orderNo3 = User->getIROrder(); 4418 if (orderNo3 > orderNo) { 4419 opIsLive = true; 4420 break; 4421 } 4422 } 4423 4424 if (!opIsLive) 4425 for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end(); UI != UE; ++UI) { 4426 SDNode *User = *UI; 4427 int orderNo3 = User->getIROrder(); 4428 if (orderNo3 > orderNo) { 4429 opIsLive = true; 4430 break; 4431 } 4432 } 4433 4434 if (!opIsLive) 4435 return SDValue(); 4436 } 4437 4438 return DAG.getNode(ISD::FMA, SDLoc(N), VT, 4439 N0.getOperand(0), N0.getOperand(1), N1); 4440 } 4441 } 4442 4443 return SDValue(); 4444 } 4445 4446 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 4447 /// 4448 static SDValue PerformADDCombine(SDNode *N, 4449 TargetLowering::DAGCombinerInfo &DCI, 4450 const NVPTXSubtarget &Subtarget, 4451 CodeGenOpt::Level OptLevel) { 4452 SDValue N0 = N->getOperand(0); 4453 SDValue N1 = N->getOperand(1); 4454 4455 // First try with the default operand order. 4456 if (SDValue Result = 4457 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) 4458 return Result; 4459 4460 // If that didn't work, try again with the operands commuted. 4461 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); 4462 } 4463 4464 static SDValue PerformANDCombine(SDNode *N, 4465 TargetLowering::DAGCombinerInfo &DCI) { 4466 // The type legalizer turns a vector load of i8 values into a zextload to i16 4467 // registers, optionally ANY_EXTENDs it (if target type is integer), 4468 // and ANDs off the high 8 bits. Since we turn this load into a 4469 // target-specific DAG node, the DAG combiner fails to eliminate these AND 4470 // nodes. Do that here. 4471 SDValue Val = N->getOperand(0); 4472 SDValue Mask = N->getOperand(1); 4473 4474 if (isa<ConstantSDNode>(Val)) { 4475 std::swap(Val, Mask); 4476 } 4477 4478 SDValue AExt; 4479 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 4480 if (Val.getOpcode() == ISD::ANY_EXTEND) { 4481 AExt = Val; 4482 Val = Val->getOperand(0); 4483 } 4484 4485 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 4486 Val = Val->getOperand(0); 4487 } 4488 4489 if (Val->getOpcode() == NVPTXISD::LoadV2 || 4490 Val->getOpcode() == NVPTXISD::LoadV4) { 4491 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 4492 if (!MaskCnst) { 4493 // Not an AND with a constant 4494 return SDValue(); 4495 } 4496 4497 uint64_t MaskVal = MaskCnst->getZExtValue(); 4498 if (MaskVal != 0xff) { 4499 // Not an AND that chops off top 8 bits 4500 return SDValue(); 4501 } 4502 4503 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 4504 if (!Mem) { 4505 // Not a MemSDNode?!? 4506 return SDValue(); 4507 } 4508 4509 EVT MemVT = Mem->getMemoryVT(); 4510 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 4511 // We only handle the i8 case 4512 return SDValue(); 4513 } 4514 4515 unsigned ExtType = 4516 cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> 4517 getZExtValue(); 4518 if (ExtType == ISD::SEXTLOAD) { 4519 // If for some reason the load is a sextload, the and is needed to zero 4520 // out the high 8 bits 4521 return SDValue(); 4522 } 4523 4524 bool AddTo = false; 4525 if (AExt.getNode() != nullptr) { 4526 // Re-insert the ext as a zext. 4527 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 4528 AExt.getValueType(), Val); 4529 AddTo = true; 4530 } 4531 4532 // If we get here, the AND is unnecessary. Just replace it with the load 4533 DCI.CombineTo(N, Val, AddTo); 4534 } 4535 4536 return SDValue(); 4537 } 4538 4539 static SDValue PerformREMCombine(SDNode *N, 4540 TargetLowering::DAGCombinerInfo &DCI, 4541 CodeGenOpt::Level OptLevel) { 4542 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); 4543 4544 // Don't do anything at less than -O2. 4545 if (OptLevel < CodeGenOpt::Default) 4546 return SDValue(); 4547 4548 SelectionDAG &DAG = DCI.DAG; 4549 SDLoc DL(N); 4550 EVT VT = N->getValueType(0); 4551 bool IsSigned = N->getOpcode() == ISD::SREM; 4552 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; 4553 4554 const SDValue &Num = N->getOperand(0); 4555 const SDValue &Den = N->getOperand(1); 4556 4557 for (const SDNode *U : Num->uses()) { 4558 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && 4559 U->getOperand(1) == Den) { 4560 // Num % Den -> Num - (Num / Den) * Den 4561 return DAG.getNode(ISD::SUB, DL, VT, Num, 4562 DAG.getNode(ISD::MUL, DL, VT, 4563 DAG.getNode(DivOpc, DL, VT, Num, Den), 4564 Den)); 4565 } 4566 } 4567 return SDValue(); 4568 } 4569 4570 enum OperandSignedness { 4571 Signed = 0, 4572 Unsigned, 4573 Unknown 4574 }; 4575 4576 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 4577 /// that can be demoted to \p OptSize bits without loss of information. The 4578 /// signedness of the operand, if determinable, is placed in \p S. 4579 static bool IsMulWideOperandDemotable(SDValue Op, 4580 unsigned OptSize, 4581 OperandSignedness &S) { 4582 S = Unknown; 4583 4584 if (Op.getOpcode() == ISD::SIGN_EXTEND || 4585 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 4586 EVT OrigVT = Op.getOperand(0).getValueType(); 4587 if (OrigVT.getSizeInBits() <= OptSize) { 4588 S = Signed; 4589 return true; 4590 } 4591 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 4592 EVT OrigVT = Op.getOperand(0).getValueType(); 4593 if (OrigVT.getSizeInBits() <= OptSize) { 4594 S = Unsigned; 4595 return true; 4596 } 4597 } 4598 4599 return false; 4600 } 4601 4602 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 4603 /// be demoted to \p OptSize bits without loss of information. If the operands 4604 /// contain a constant, it should appear as the RHS operand. The signedness of 4605 /// the operands is placed in \p IsSigned. 4606 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 4607 unsigned OptSize, 4608 bool &IsSigned) { 4609 OperandSignedness LHSSign; 4610 4611 // The LHS operand must be a demotable op 4612 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 4613 return false; 4614 4615 // We should have been able to determine the signedness from the LHS 4616 if (LHSSign == Unknown) 4617 return false; 4618 4619 IsSigned = (LHSSign == Signed); 4620 4621 // The RHS can be a demotable op or a constant 4622 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 4623 const APInt &Val = CI->getAPIntValue(); 4624 if (LHSSign == Unsigned) { 4625 return Val.isIntN(OptSize); 4626 } else { 4627 return Val.isSignedIntN(OptSize); 4628 } 4629 } else { 4630 OperandSignedness RHSSign; 4631 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 4632 return false; 4633 4634 return LHSSign == RHSSign; 4635 } 4636 } 4637 4638 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 4639 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 4640 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 4641 /// amount. 4642 static SDValue TryMULWIDECombine(SDNode *N, 4643 TargetLowering::DAGCombinerInfo &DCI) { 4644 EVT MulType = N->getValueType(0); 4645 if (MulType != MVT::i32 && MulType != MVT::i64) { 4646 return SDValue(); 4647 } 4648 4649 SDLoc DL(N); 4650 unsigned OptSize = MulType.getSizeInBits() >> 1; 4651 SDValue LHS = N->getOperand(0); 4652 SDValue RHS = N->getOperand(1); 4653 4654 // Canonicalize the multiply so the constant (if any) is on the right 4655 if (N->getOpcode() == ISD::MUL) { 4656 if (isa<ConstantSDNode>(LHS)) { 4657 std::swap(LHS, RHS); 4658 } 4659 } 4660 4661 // If we have a SHL, determine the actual multiply amount 4662 if (N->getOpcode() == ISD::SHL) { 4663 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 4664 if (!ShlRHS) { 4665 return SDValue(); 4666 } 4667 4668 APInt ShiftAmt = ShlRHS->getAPIntValue(); 4669 unsigned BitWidth = MulType.getSizeInBits(); 4670 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 4671 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 4672 RHS = DCI.DAG.getConstant(MulVal, DL, MulType); 4673 } else { 4674 return SDValue(); 4675 } 4676 } 4677 4678 bool Signed; 4679 // Verify that our operands are demotable 4680 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 4681 return SDValue(); 4682 } 4683 4684 EVT DemotedVT; 4685 if (MulType == MVT::i32) { 4686 DemotedVT = MVT::i16; 4687 } else { 4688 DemotedVT = MVT::i32; 4689 } 4690 4691 // Truncate the operands to the correct size. Note that these are just for 4692 // type consistency and will (likely) be eliminated in later phases. 4693 SDValue TruncLHS = 4694 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); 4695 SDValue TruncRHS = 4696 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); 4697 4698 unsigned Opc; 4699 if (Signed) { 4700 Opc = NVPTXISD::MUL_WIDE_SIGNED; 4701 } else { 4702 Opc = NVPTXISD::MUL_WIDE_UNSIGNED; 4703 } 4704 4705 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); 4706 } 4707 4708 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. 4709 static SDValue PerformMULCombine(SDNode *N, 4710 TargetLowering::DAGCombinerInfo &DCI, 4711 CodeGenOpt::Level OptLevel) { 4712 if (OptLevel > 0) { 4713 // Try mul.wide combining at OptLevel > 0 4714 if (SDValue Ret = TryMULWIDECombine(N, DCI)) 4715 return Ret; 4716 } 4717 4718 return SDValue(); 4719 } 4720 4721 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. 4722 static SDValue PerformSHLCombine(SDNode *N, 4723 TargetLowering::DAGCombinerInfo &DCI, 4724 CodeGenOpt::Level OptLevel) { 4725 if (OptLevel > 0) { 4726 // Try mul.wide combining at OptLevel > 0 4727 if (SDValue Ret = TryMULWIDECombine(N, DCI)) 4728 return Ret; 4729 } 4730 4731 return SDValue(); 4732 } 4733 4734 static SDValue PerformSETCCCombine(SDNode *N, 4735 TargetLowering::DAGCombinerInfo &DCI) { 4736 EVT CCType = N->getValueType(0); 4737 SDValue A = N->getOperand(0); 4738 SDValue B = N->getOperand(1); 4739 4740 if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16) 4741 return SDValue(); 4742 4743 SDLoc DL(N); 4744 // setp.f16x2 returns two scalar predicates, which we need to 4745 // convert back to v2i1. The returned result will be scalarized by 4746 // the legalizer, but the comparison will remain a single vector 4747 // instruction. 4748 SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL, 4749 DCI.DAG.getVTList(MVT::i1, MVT::i1), 4750 {A, B, N->getOperand(2)}); 4751 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0), 4752 CCNode.getValue(1)); 4753 } 4754 4755 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, 4756 DAGCombinerInfo &DCI) const { 4757 CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel(); 4758 switch (N->getOpcode()) { 4759 default: break; 4760 case ISD::ADD: 4761 case ISD::FADD: 4762 return PerformADDCombine(N, DCI, STI, OptLevel); 4763 case ISD::MUL: 4764 return PerformMULCombine(N, DCI, OptLevel); 4765 case ISD::SHL: 4766 return PerformSHLCombine(N, DCI, OptLevel); 4767 case ISD::AND: 4768 return PerformANDCombine(N, DCI); 4769 case ISD::UREM: 4770 case ISD::SREM: 4771 return PerformREMCombine(N, DCI, OptLevel); 4772 case ISD::SETCC: 4773 return PerformSETCCCombine(N, DCI); 4774 } 4775 return SDValue(); 4776 } 4777 4778 /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. 4779 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, 4780 SmallVectorImpl<SDValue> &Results) { 4781 EVT ResVT = N->getValueType(0); 4782 SDLoc DL(N); 4783 4784 assert(ResVT.isVector() && "Vector load must have vector type"); 4785 4786 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 4787 // legal. We can (and should) split that into 2 loads of <2 x double> here 4788 // but I'm leaving that as a TODO for now. 4789 assert(ResVT.isSimple() && "Can only handle simple types"); 4790 switch (ResVT.getSimpleVT().SimpleTy) { 4791 default: 4792 return; 4793 case MVT::v2i8: 4794 case MVT::v2i16: 4795 case MVT::v2i32: 4796 case MVT::v2i64: 4797 case MVT::v2f16: 4798 case MVT::v2f32: 4799 case MVT::v2f64: 4800 case MVT::v4i8: 4801 case MVT::v4i16: 4802 case MVT::v4i32: 4803 case MVT::v4f16: 4804 case MVT::v4f32: 4805 case MVT::v8f16: // <4 x f16x2> 4806 // This is a "native" vector type 4807 break; 4808 } 4809 4810 LoadSDNode *LD = cast<LoadSDNode>(N); 4811 4812 unsigned Align = LD->getAlignment(); 4813 auto &TD = DAG.getDataLayout(); 4814 unsigned PrefAlign = 4815 TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext())); 4816 if (Align < PrefAlign) { 4817 // This load is not sufficiently aligned, so bail out and let this vector 4818 // load be scalarized. Note that we may still be able to emit smaller 4819 // vector loads. For example, if we are loading a <4 x float> with an 4820 // alignment of 8, this check will fail but the legalizer will try again 4821 // with 2 x <2 x float>, which will succeed with an alignment of 8. 4822 return; 4823 } 4824 4825 EVT EltVT = ResVT.getVectorElementType(); 4826 unsigned NumElts = ResVT.getVectorNumElements(); 4827 4828 // Since LoadV2 is a target node, we cannot rely on DAG type legalization. 4829 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 4830 // loaded type to i16 and propagate the "real" type as the memory type. 4831 bool NeedTrunc = false; 4832 if (EltVT.getSizeInBits() < 16) { 4833 EltVT = MVT::i16; 4834 NeedTrunc = true; 4835 } 4836 4837 unsigned Opcode = 0; 4838 SDVTList LdResVTs; 4839 bool LoadF16x2 = false; 4840 4841 switch (NumElts) { 4842 default: 4843 return; 4844 case 2: 4845 Opcode = NVPTXISD::LoadV2; 4846 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 4847 break; 4848 case 4: { 4849 Opcode = NVPTXISD::LoadV4; 4850 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 4851 LdResVTs = DAG.getVTList(ListVTs); 4852 break; 4853 } 4854 case 8: { 4855 // v8f16 is a special case. PTX doesn't have ld.v8.f16 4856 // instruction. Instead, we split the vector into v2f16 chunks and 4857 // load them with ld.v4.b32. 4858 assert(EltVT == MVT::f16 && "Unsupported v8 vector type."); 4859 LoadF16x2 = true; 4860 Opcode = NVPTXISD::LoadV4; 4861 EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16, 4862 MVT::Other}; 4863 LdResVTs = DAG.getVTList(ListVTs); 4864 break; 4865 } 4866 } 4867 4868 // Copy regular operands 4869 SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end()); 4870 4871 // The select routine does not have access to the LoadSDNode instance, so 4872 // pass along the extension information 4873 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); 4874 4875 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 4876 LD->getMemoryVT(), 4877 LD->getMemOperand()); 4878 4879 SmallVector<SDValue, 8> ScalarRes; 4880 if (LoadF16x2) { 4881 // Split v2f16 subvectors back into individual elements. 4882 NumElts /= 2; 4883 for (unsigned i = 0; i < NumElts; ++i) { 4884 SDValue SubVector = NewLD.getValue(i); 4885 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, 4886 DAG.getIntPtrConstant(0, DL)); 4887 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, 4888 DAG.getIntPtrConstant(1, DL)); 4889 ScalarRes.push_back(E0); 4890 ScalarRes.push_back(E1); 4891 } 4892 } else { 4893 for (unsigned i = 0; i < NumElts; ++i) { 4894 SDValue Res = NewLD.getValue(i); 4895 if (NeedTrunc) 4896 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 4897 ScalarRes.push_back(Res); 4898 } 4899 } 4900 4901 SDValue LoadChain = NewLD.getValue(NumElts); 4902 4903 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes); 4904 4905 Results.push_back(BuildVec); 4906 Results.push_back(LoadChain); 4907 } 4908 4909 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, 4910 SmallVectorImpl<SDValue> &Results) { 4911 SDValue Chain = N->getOperand(0); 4912 SDValue Intrin = N->getOperand(1); 4913 SDLoc DL(N); 4914 4915 // Get the intrinsic ID 4916 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); 4917 switch (IntrinNo) { 4918 default: 4919 return; 4920 case Intrinsic::nvvm_ldg_global_i: 4921 case Intrinsic::nvvm_ldg_global_f: 4922 case Intrinsic::nvvm_ldg_global_p: 4923 case Intrinsic::nvvm_ldu_global_i: 4924 case Intrinsic::nvvm_ldu_global_f: 4925 case Intrinsic::nvvm_ldu_global_p: { 4926 EVT ResVT = N->getValueType(0); 4927 4928 if (ResVT.isVector()) { 4929 // Vector LDG/LDU 4930 4931 unsigned NumElts = ResVT.getVectorNumElements(); 4932 EVT EltVT = ResVT.getVectorElementType(); 4933 4934 // Since LDU/LDG are target nodes, we cannot rely on DAG type 4935 // legalization. 4936 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 4937 // loaded type to i16 and propagate the "real" type as the memory type. 4938 bool NeedTrunc = false; 4939 if (EltVT.getSizeInBits() < 16) { 4940 EltVT = MVT::i16; 4941 NeedTrunc = true; 4942 } 4943 4944 unsigned Opcode = 0; 4945 SDVTList LdResVTs; 4946 4947 switch (NumElts) { 4948 default: 4949 return; 4950 case 2: 4951 switch (IntrinNo) { 4952 default: 4953 return; 4954 case Intrinsic::nvvm_ldg_global_i: 4955 case Intrinsic::nvvm_ldg_global_f: 4956 case Intrinsic::nvvm_ldg_global_p: 4957 Opcode = NVPTXISD::LDGV2; 4958 break; 4959 case Intrinsic::nvvm_ldu_global_i: 4960 case Intrinsic::nvvm_ldu_global_f: 4961 case Intrinsic::nvvm_ldu_global_p: 4962 Opcode = NVPTXISD::LDUV2; 4963 break; 4964 } 4965 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 4966 break; 4967 case 4: { 4968 switch (IntrinNo) { 4969 default: 4970 return; 4971 case Intrinsic::nvvm_ldg_global_i: 4972 case Intrinsic::nvvm_ldg_global_f: 4973 case Intrinsic::nvvm_ldg_global_p: 4974 Opcode = NVPTXISD::LDGV4; 4975 break; 4976 case Intrinsic::nvvm_ldu_global_i: 4977 case Intrinsic::nvvm_ldu_global_f: 4978 case Intrinsic::nvvm_ldu_global_p: 4979 Opcode = NVPTXISD::LDUV4; 4980 break; 4981 } 4982 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 4983 LdResVTs = DAG.getVTList(ListVTs); 4984 break; 4985 } 4986 } 4987 4988 SmallVector<SDValue, 8> OtherOps; 4989 4990 // Copy regular operands 4991 4992 OtherOps.push_back(Chain); // Chain 4993 // Skip operand 1 (intrinsic ID) 4994 // Others 4995 OtherOps.append(N->op_begin() + 2, N->op_end()); 4996 4997 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 4998 4999 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 5000 MemSD->getMemoryVT(), 5001 MemSD->getMemOperand()); 5002 5003 SmallVector<SDValue, 4> ScalarRes; 5004 5005 for (unsigned i = 0; i < NumElts; ++i) { 5006 SDValue Res = NewLD.getValue(i); 5007 if (NeedTrunc) 5008 Res = 5009 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 5010 ScalarRes.push_back(Res); 5011 } 5012 5013 SDValue LoadChain = NewLD.getValue(NumElts); 5014 5015 SDValue BuildVec = 5016 DAG.getBuildVector(ResVT, DL, ScalarRes); 5017 5018 Results.push_back(BuildVec); 5019 Results.push_back(LoadChain); 5020 } else { 5021 // i8 LDG/LDU 5022 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && 5023 "Custom handling of non-i8 ldu/ldg?"); 5024 5025 // Just copy all operands as-is 5026 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); 5027 5028 // Force output to i16 5029 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); 5030 5031 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 5032 5033 // We make sure the memory type is i8, which will be used during isel 5034 // to select the proper instruction. 5035 SDValue NewLD = 5036 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, 5037 MVT::i8, MemSD->getMemOperand()); 5038 5039 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, 5040 NewLD.getValue(0))); 5041 Results.push_back(NewLD.getValue(1)); 5042 } 5043 } 5044 } 5045 } 5046 5047 void NVPTXTargetLowering::ReplaceNodeResults( 5048 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 5049 switch (N->getOpcode()) { 5050 default: 5051 report_fatal_error("Unhandled custom legalization"); 5052 case ISD::LOAD: 5053 ReplaceLoadVector(N, DAG, Results); 5054 return; 5055 case ISD::INTRINSIC_W_CHAIN: 5056 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); 5057 return; 5058 } 5059 } 5060 5061 // Pin NVPTXTargetObjectFile's vtables to this file. 5062 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {} 5063 5064 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( 5065 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { 5066 return getDataSection(); 5067 } 5068