//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static unsigned int uniqueCallSite = 0;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));

static cl::opt<unsigned>
    FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                        cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
                                 " 1: do it, 2: do it aggressively)"),
                        cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it.
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled.
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}

bool NVPTXTargetLowering::usePrecSqrtF32() const {
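  // As with getDivF32Level(): an explicit -nvptx-prec-sqrtf32 setting on the
  // command line always wins; otherwise fast math selects the approximate form.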
104 if (UsePrecSqrtF32.getNumOccurrences() > 0) { 105 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it 106 return UsePrecSqrtF32; 107 } else { 108 // Otherwise, use sqrt.approx if fast math is enabled 109 return !getTargetMachine().Options.UnsafeFPMath; 110 } 111 } 112 113 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const { 114 return MF.getDenormalMode(APFloat::IEEEsingle()).Output == 115 DenormalMode::PreserveSign; 116 } 117 118 static bool IsPTXVectorType(MVT VT) { 119 switch (VT.SimpleTy) { 120 default: 121 return false; 122 case MVT::v2i1: 123 case MVT::v4i1: 124 case MVT::v2i8: 125 case MVT::v4i8: 126 case MVT::v2i16: 127 case MVT::v4i16: 128 case MVT::v2i32: 129 case MVT::v4i32: 130 case MVT::v2i64: 131 case MVT::v2f16: 132 case MVT::v4f16: 133 case MVT::v8f16: // <4 x f16x2> 134 case MVT::v2f32: 135 case MVT::v4f32: 136 case MVT::v2f64: 137 return true; 138 } 139 } 140 141 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive 142 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors 143 /// into their primitive components. 144 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the 145 /// same number of types as the Ins/Outs arrays in LowerFormalArguments, 146 /// LowerCall, and LowerReturn. 147 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, 148 Type *Ty, SmallVectorImpl<EVT> &ValueVTs, 149 SmallVectorImpl<uint64_t> *Offsets = nullptr, 150 uint64_t StartingOffset = 0) { 151 SmallVector<EVT, 16> TempVTs; 152 SmallVector<uint64_t, 16> TempOffsets; 153 154 // Special case for i128 - decompose to (i64, i64) 155 if (Ty->isIntegerTy(128)) { 156 ValueVTs.push_back(EVT(MVT::i64)); 157 ValueVTs.push_back(EVT(MVT::i64)); 158 159 if (Offsets) { 160 Offsets->push_back(StartingOffset + 0); 161 Offsets->push_back(StartingOffset + 8); 162 } 163 164 return; 165 } 166 167 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs. 168 if (StructType *STy = dyn_cast<StructType>(Ty)) { 169 auto const *SL = DL.getStructLayout(STy); 170 auto ElementNum = 0; 171 for(auto *EI : STy->elements()) { 172 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets, 173 StartingOffset + SL->getElementOffset(ElementNum)); 174 ++ElementNum; 175 } 176 return; 177 } 178 179 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); 180 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { 181 EVT VT = TempVTs[i]; 182 uint64_t Off = TempOffsets[i]; 183 // Split vectors into individual elements, except for v2f16, which 184 // we will pass as a single scalar. 185 if (VT.isVector()) { 186 unsigned NumElts = VT.getVectorNumElements(); 187 EVT EltVT = VT.getVectorElementType(); 188 // Vectors with an even number of f16 elements will be passed to 189 // us as an array of v2f16 elements. We must match this so we 190 // stay in sync with Ins/Outs. 191 if (EltVT == MVT::f16 && NumElts % 2 == 0) { 192 EltVT = MVT::v2f16; 193 NumElts /= 2; 194 } 195 for (unsigned j = 0; j != NumElts; ++j) { 196 ValueVTs.push_back(EltVT); 197 if (Offsets) 198 Offsets->push_back(Off + j * EltVT.getStoreSize()); 199 } 200 } else { 201 ValueVTs.push_back(VT); 202 if (Offsets) 203 Offsets->push_back(Off); 204 } 205 } 206 } 207 208 // Check whether we can merge loads/stores of some of the pieces of a 209 // flattened function parameter or return value into a single vector 210 // load/store. 
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
  return NumElts;
}

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  // Check what we can vectorize using 128/64/32/16-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
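      // For example (hypothetical values): four f32 pieces at offsets
      // 0/4/8/12 within a 16-byte aligned parameter are merged on the
      // AccessSize == 16 attempt, and the switch below marks them
      // PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST.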
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using the largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling.
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
                 MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instructions.
  // For others we will expand to a SHL/SRA pair.
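  // For example, sign-extending an i8 value held in a 32-bit register maps to
  // a single cvt instruction (e.g. cvt.s32.s8), while the i1 case below is
  // expanded to a shl/sra pair.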
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend.
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded.
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store of predicate registers.
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // This is legal in NVPTX.
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

  // TRAP can be lowered to PTX trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores.
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Custom handling for i8 intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first.
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes.
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SREM);
  setTargetDAGCombine(ISD::UREM);

  // setcc for f16x2 needs special handling to prevent the legalizer's
  // attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have a full implementation. Others
  // have only a token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
  }

  // There's no neg.f16 instruction. Expand to (0-x).
  setOperationAction(ISD::FNEG, MVT::f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);

  // (would be) Library functions.
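  // That is, operations that would typically become libcalls on CPU targets;
  // since there is no runtime library to call from GPU code, the actions below
  // either map them to PTX instructions or expand them inline.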
530 531 // These map to conversion instructions for scalar FP types. 532 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, 533 ISD::FTRUNC}) { 534 setOperationAction(Op, MVT::f16, Legal); 535 setOperationAction(Op, MVT::f32, Legal); 536 setOperationAction(Op, MVT::f64, Legal); 537 setOperationAction(Op, MVT::v2f16, Expand); 538 } 539 540 setOperationAction(ISD::FROUND, MVT::f16, Promote); 541 setOperationAction(ISD::FROUND, MVT::v2f16, Expand); 542 setOperationAction(ISD::FROUND, MVT::f32, Custom); 543 setOperationAction(ISD::FROUND, MVT::f64, Custom); 544 545 546 // 'Expand' implements FCOPYSIGN without calling an external library. 547 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 548 setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); 549 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 550 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 551 552 // These map to corresponding instructions for f32/f64. f16 must be 553 // promoted to f32. v2f16 is expanded to f16, which is then promoted 554 // to f32. 555 for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, 556 ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) { 557 setOperationAction(Op, MVT::f16, Promote); 558 setOperationAction(Op, MVT::f32, Legal); 559 setOperationAction(Op, MVT::f64, Legal); 560 setOperationAction(Op, MVT::v2f16, Expand); 561 } 562 setOperationAction(ISD::FMINNUM, MVT::f16, Promote); 563 setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); 564 setOperationAction(ISD::FMINIMUM, MVT::f16, Promote); 565 setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote); 566 567 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. 568 // No FPOW or FREM in PTX. 569 570 // Now deduce the information based on the above mentioned 571 // actions 572 computeRegisterProperties(STI.getRegisterInfo()); 573 } 574 575 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 576 switch ((NVPTXISD::NodeType)Opcode) { 577 case NVPTXISD::FIRST_NUMBER: 578 break; 579 case NVPTXISD::CALL: 580 return "NVPTXISD::CALL"; 581 case NVPTXISD::RET_FLAG: 582 return "NVPTXISD::RET_FLAG"; 583 case NVPTXISD::LOAD_PARAM: 584 return "NVPTXISD::LOAD_PARAM"; 585 case NVPTXISD::Wrapper: 586 return "NVPTXISD::Wrapper"; 587 case NVPTXISD::DeclareParam: 588 return "NVPTXISD::DeclareParam"; 589 case NVPTXISD::DeclareScalarParam: 590 return "NVPTXISD::DeclareScalarParam"; 591 case NVPTXISD::DeclareRet: 592 return "NVPTXISD::DeclareRet"; 593 case NVPTXISD::DeclareScalarRet: 594 return "NVPTXISD::DeclareScalarRet"; 595 case NVPTXISD::DeclareRetParam: 596 return "NVPTXISD::DeclareRetParam"; 597 case NVPTXISD::PrintCall: 598 return "NVPTXISD::PrintCall"; 599 case NVPTXISD::PrintConvergentCall: 600 return "NVPTXISD::PrintConvergentCall"; 601 case NVPTXISD::PrintCallUni: 602 return "NVPTXISD::PrintCallUni"; 603 case NVPTXISD::PrintConvergentCallUni: 604 return "NVPTXISD::PrintConvergentCallUni"; 605 case NVPTXISD::LoadParam: 606 return "NVPTXISD::LoadParam"; 607 case NVPTXISD::LoadParamV2: 608 return "NVPTXISD::LoadParamV2"; 609 case NVPTXISD::LoadParamV4: 610 return "NVPTXISD::LoadParamV4"; 611 case NVPTXISD::StoreParam: 612 return "NVPTXISD::StoreParam"; 613 case NVPTXISD::StoreParamV2: 614 return "NVPTXISD::StoreParamV2"; 615 case NVPTXISD::StoreParamV4: 616 return "NVPTXISD::StoreParamV4"; 617 case NVPTXISD::StoreParamS32: 618 return "NVPTXISD::StoreParamS32"; 619 case NVPTXISD::StoreParamU32: 620 return "NVPTXISD::StoreParamU32"; 621 case NVPTXISD::CallArgBegin: 622 
return "NVPTXISD::CallArgBegin"; 623 case NVPTXISD::CallArg: 624 return "NVPTXISD::CallArg"; 625 case NVPTXISD::LastCallArg: 626 return "NVPTXISD::LastCallArg"; 627 case NVPTXISD::CallArgEnd: 628 return "NVPTXISD::CallArgEnd"; 629 case NVPTXISD::CallVoid: 630 return "NVPTXISD::CallVoid"; 631 case NVPTXISD::CallVal: 632 return "NVPTXISD::CallVal"; 633 case NVPTXISD::CallSymbol: 634 return "NVPTXISD::CallSymbol"; 635 case NVPTXISD::Prototype: 636 return "NVPTXISD::Prototype"; 637 case NVPTXISD::MoveParam: 638 return "NVPTXISD::MoveParam"; 639 case NVPTXISD::StoreRetval: 640 return "NVPTXISD::StoreRetval"; 641 case NVPTXISD::StoreRetvalV2: 642 return "NVPTXISD::StoreRetvalV2"; 643 case NVPTXISD::StoreRetvalV4: 644 return "NVPTXISD::StoreRetvalV4"; 645 case NVPTXISD::PseudoUseParam: 646 return "NVPTXISD::PseudoUseParam"; 647 case NVPTXISD::RETURN: 648 return "NVPTXISD::RETURN"; 649 case NVPTXISD::CallSeqBegin: 650 return "NVPTXISD::CallSeqBegin"; 651 case NVPTXISD::CallSeqEnd: 652 return "NVPTXISD::CallSeqEnd"; 653 case NVPTXISD::CallPrototype: 654 return "NVPTXISD::CallPrototype"; 655 case NVPTXISD::ProxyReg: 656 return "NVPTXISD::ProxyReg"; 657 case NVPTXISD::LoadV2: 658 return "NVPTXISD::LoadV2"; 659 case NVPTXISD::LoadV4: 660 return "NVPTXISD::LoadV4"; 661 case NVPTXISD::LDGV2: 662 return "NVPTXISD::LDGV2"; 663 case NVPTXISD::LDGV4: 664 return "NVPTXISD::LDGV4"; 665 case NVPTXISD::LDUV2: 666 return "NVPTXISD::LDUV2"; 667 case NVPTXISD::LDUV4: 668 return "NVPTXISD::LDUV4"; 669 case NVPTXISD::StoreV2: 670 return "NVPTXISD::StoreV2"; 671 case NVPTXISD::StoreV4: 672 return "NVPTXISD::StoreV4"; 673 case NVPTXISD::FUN_SHFL_CLAMP: 674 return "NVPTXISD::FUN_SHFL_CLAMP"; 675 case NVPTXISD::FUN_SHFR_CLAMP: 676 return "NVPTXISD::FUN_SHFR_CLAMP"; 677 case NVPTXISD::IMAD: 678 return "NVPTXISD::IMAD"; 679 case NVPTXISD::SETP_F16X2: 680 return "NVPTXISD::SETP_F16X2"; 681 case NVPTXISD::Dummy: 682 return "NVPTXISD::Dummy"; 683 case NVPTXISD::MUL_WIDE_SIGNED: 684 return "NVPTXISD::MUL_WIDE_SIGNED"; 685 case NVPTXISD::MUL_WIDE_UNSIGNED: 686 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 687 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; 688 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 689 case NVPTXISD::Tex1DFloatFloatLevel: 690 return "NVPTXISD::Tex1DFloatFloatLevel"; 691 case NVPTXISD::Tex1DFloatFloatGrad: 692 return "NVPTXISD::Tex1DFloatFloatGrad"; 693 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; 694 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; 695 case NVPTXISD::Tex1DS32FloatLevel: 696 return "NVPTXISD::Tex1DS32FloatLevel"; 697 case NVPTXISD::Tex1DS32FloatGrad: 698 return "NVPTXISD::Tex1DS32FloatGrad"; 699 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; 700 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; 701 case NVPTXISD::Tex1DU32FloatLevel: 702 return "NVPTXISD::Tex1DU32FloatLevel"; 703 case NVPTXISD::Tex1DU32FloatGrad: 704 return "NVPTXISD::Tex1DU32FloatGrad"; 705 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; 706 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; 707 case NVPTXISD::Tex1DArrayFloatFloatLevel: 708 return "NVPTXISD::Tex1DArrayFloatFloatLevel"; 709 case NVPTXISD::Tex1DArrayFloatFloatGrad: 710 return "NVPTXISD::Tex1DArrayFloatFloatGrad"; 711 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; 712 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; 713 case NVPTXISD::Tex1DArrayS32FloatLevel: 714 return 
"NVPTXISD::Tex1DArrayS32FloatLevel"; 715 case NVPTXISD::Tex1DArrayS32FloatGrad: 716 return "NVPTXISD::Tex1DArrayS32FloatGrad"; 717 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; 718 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; 719 case NVPTXISD::Tex1DArrayU32FloatLevel: 720 return "NVPTXISD::Tex1DArrayU32FloatLevel"; 721 case NVPTXISD::Tex1DArrayU32FloatGrad: 722 return "NVPTXISD::Tex1DArrayU32FloatGrad"; 723 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; 724 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 725 case NVPTXISD::Tex2DFloatFloatLevel: 726 return "NVPTXISD::Tex2DFloatFloatLevel"; 727 case NVPTXISD::Tex2DFloatFloatGrad: 728 return "NVPTXISD::Tex2DFloatFloatGrad"; 729 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; 730 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; 731 case NVPTXISD::Tex2DS32FloatLevel: 732 return "NVPTXISD::Tex2DS32FloatLevel"; 733 case NVPTXISD::Tex2DS32FloatGrad: 734 return "NVPTXISD::Tex2DS32FloatGrad"; 735 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; 736 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; 737 case NVPTXISD::Tex2DU32FloatLevel: 738 return "NVPTXISD::Tex2DU32FloatLevel"; 739 case NVPTXISD::Tex2DU32FloatGrad: 740 return "NVPTXISD::Tex2DU32FloatGrad"; 741 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; 742 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 743 case NVPTXISD::Tex2DArrayFloatFloatLevel: 744 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 745 case NVPTXISD::Tex2DArrayFloatFloatGrad: 746 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 747 case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; 748 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; 749 case NVPTXISD::Tex2DArrayS32FloatLevel: 750 return "NVPTXISD::Tex2DArrayS32FloatLevel"; 751 case NVPTXISD::Tex2DArrayS32FloatGrad: 752 return "NVPTXISD::Tex2DArrayS32FloatGrad"; 753 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; 754 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; 755 case NVPTXISD::Tex2DArrayU32FloatLevel: 756 return "NVPTXISD::Tex2DArrayU32FloatLevel"; 757 case NVPTXISD::Tex2DArrayU32FloatGrad: 758 return "NVPTXISD::Tex2DArrayU32FloatGrad"; 759 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; 760 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 761 case NVPTXISD::Tex3DFloatFloatLevel: 762 return "NVPTXISD::Tex3DFloatFloatLevel"; 763 case NVPTXISD::Tex3DFloatFloatGrad: 764 return "NVPTXISD::Tex3DFloatFloatGrad"; 765 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; 766 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; 767 case NVPTXISD::Tex3DS32FloatLevel: 768 return "NVPTXISD::Tex3DS32FloatLevel"; 769 case NVPTXISD::Tex3DS32FloatGrad: 770 return "NVPTXISD::Tex3DS32FloatGrad"; 771 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; 772 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; 773 case NVPTXISD::Tex3DU32FloatLevel: 774 return "NVPTXISD::Tex3DU32FloatLevel"; 775 case NVPTXISD::Tex3DU32FloatGrad: 776 return "NVPTXISD::Tex3DU32FloatGrad"; 777 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; 778 case NVPTXISD::TexCubeFloatFloatLevel: 779 return "NVPTXISD::TexCubeFloatFloatLevel"; 780 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; 781 case NVPTXISD::TexCubeS32FloatLevel: 
782 return "NVPTXISD::TexCubeS32FloatLevel"; 783 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; 784 case NVPTXISD::TexCubeU32FloatLevel: 785 return "NVPTXISD::TexCubeU32FloatLevel"; 786 case NVPTXISD::TexCubeArrayFloatFloat: 787 return "NVPTXISD::TexCubeArrayFloatFloat"; 788 case NVPTXISD::TexCubeArrayFloatFloatLevel: 789 return "NVPTXISD::TexCubeArrayFloatFloatLevel"; 790 case NVPTXISD::TexCubeArrayS32Float: 791 return "NVPTXISD::TexCubeArrayS32Float"; 792 case NVPTXISD::TexCubeArrayS32FloatLevel: 793 return "NVPTXISD::TexCubeArrayS32FloatLevel"; 794 case NVPTXISD::TexCubeArrayU32Float: 795 return "NVPTXISD::TexCubeArrayU32Float"; 796 case NVPTXISD::TexCubeArrayU32FloatLevel: 797 return "NVPTXISD::TexCubeArrayU32FloatLevel"; 798 case NVPTXISD::Tld4R2DFloatFloat: 799 return "NVPTXISD::Tld4R2DFloatFloat"; 800 case NVPTXISD::Tld4G2DFloatFloat: 801 return "NVPTXISD::Tld4G2DFloatFloat"; 802 case NVPTXISD::Tld4B2DFloatFloat: 803 return "NVPTXISD::Tld4B2DFloatFloat"; 804 case NVPTXISD::Tld4A2DFloatFloat: 805 return "NVPTXISD::Tld4A2DFloatFloat"; 806 case NVPTXISD::Tld4R2DS64Float: 807 return "NVPTXISD::Tld4R2DS64Float"; 808 case NVPTXISD::Tld4G2DS64Float: 809 return "NVPTXISD::Tld4G2DS64Float"; 810 case NVPTXISD::Tld4B2DS64Float: 811 return "NVPTXISD::Tld4B2DS64Float"; 812 case NVPTXISD::Tld4A2DS64Float: 813 return "NVPTXISD::Tld4A2DS64Float"; 814 case NVPTXISD::Tld4R2DU64Float: 815 return "NVPTXISD::Tld4R2DU64Float"; 816 case NVPTXISD::Tld4G2DU64Float: 817 return "NVPTXISD::Tld4G2DU64Float"; 818 case NVPTXISD::Tld4B2DU64Float: 819 return "NVPTXISD::Tld4B2DU64Float"; 820 case NVPTXISD::Tld4A2DU64Float: 821 return "NVPTXISD::Tld4A2DU64Float"; 822 823 case NVPTXISD::TexUnified1DFloatS32: 824 return "NVPTXISD::TexUnified1DFloatS32"; 825 case NVPTXISD::TexUnified1DFloatFloat: 826 return "NVPTXISD::TexUnified1DFloatFloat"; 827 case NVPTXISD::TexUnified1DFloatFloatLevel: 828 return "NVPTXISD::TexUnified1DFloatFloatLevel"; 829 case NVPTXISD::TexUnified1DFloatFloatGrad: 830 return "NVPTXISD::TexUnified1DFloatFloatGrad"; 831 case NVPTXISD::TexUnified1DS32S32: 832 return "NVPTXISD::TexUnified1DS32S32"; 833 case NVPTXISD::TexUnified1DS32Float: 834 return "NVPTXISD::TexUnified1DS32Float"; 835 case NVPTXISD::TexUnified1DS32FloatLevel: 836 return "NVPTXISD::TexUnified1DS32FloatLevel"; 837 case NVPTXISD::TexUnified1DS32FloatGrad: 838 return "NVPTXISD::TexUnified1DS32FloatGrad"; 839 case NVPTXISD::TexUnified1DU32S32: 840 return "NVPTXISD::TexUnified1DU32S32"; 841 case NVPTXISD::TexUnified1DU32Float: 842 return "NVPTXISD::TexUnified1DU32Float"; 843 case NVPTXISD::TexUnified1DU32FloatLevel: 844 return "NVPTXISD::TexUnified1DU32FloatLevel"; 845 case NVPTXISD::TexUnified1DU32FloatGrad: 846 return "NVPTXISD::TexUnified1DU32FloatGrad"; 847 case NVPTXISD::TexUnified1DArrayFloatS32: 848 return "NVPTXISD::TexUnified1DArrayFloatS32"; 849 case NVPTXISD::TexUnified1DArrayFloatFloat: 850 return "NVPTXISD::TexUnified1DArrayFloatFloat"; 851 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 852 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; 853 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 854 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; 855 case NVPTXISD::TexUnified1DArrayS32S32: 856 return "NVPTXISD::TexUnified1DArrayS32S32"; 857 case NVPTXISD::TexUnified1DArrayS32Float: 858 return "NVPTXISD::TexUnified1DArrayS32Float"; 859 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 860 return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; 861 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 862 return 
"NVPTXISD::TexUnified1DArrayS32FloatGrad"; 863 case NVPTXISD::TexUnified1DArrayU32S32: 864 return "NVPTXISD::TexUnified1DArrayU32S32"; 865 case NVPTXISD::TexUnified1DArrayU32Float: 866 return "NVPTXISD::TexUnified1DArrayU32Float"; 867 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 868 return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; 869 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 870 return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; 871 case NVPTXISD::TexUnified2DFloatS32: 872 return "NVPTXISD::TexUnified2DFloatS32"; 873 case NVPTXISD::TexUnified2DFloatFloat: 874 return "NVPTXISD::TexUnified2DFloatFloat"; 875 case NVPTXISD::TexUnified2DFloatFloatLevel: 876 return "NVPTXISD::TexUnified2DFloatFloatLevel"; 877 case NVPTXISD::TexUnified2DFloatFloatGrad: 878 return "NVPTXISD::TexUnified2DFloatFloatGrad"; 879 case NVPTXISD::TexUnified2DS32S32: 880 return "NVPTXISD::TexUnified2DS32S32"; 881 case NVPTXISD::TexUnified2DS32Float: 882 return "NVPTXISD::TexUnified2DS32Float"; 883 case NVPTXISD::TexUnified2DS32FloatLevel: 884 return "NVPTXISD::TexUnified2DS32FloatLevel"; 885 case NVPTXISD::TexUnified2DS32FloatGrad: 886 return "NVPTXISD::TexUnified2DS32FloatGrad"; 887 case NVPTXISD::TexUnified2DU32S32: 888 return "NVPTXISD::TexUnified2DU32S32"; 889 case NVPTXISD::TexUnified2DU32Float: 890 return "NVPTXISD::TexUnified2DU32Float"; 891 case NVPTXISD::TexUnified2DU32FloatLevel: 892 return "NVPTXISD::TexUnified2DU32FloatLevel"; 893 case NVPTXISD::TexUnified2DU32FloatGrad: 894 return "NVPTXISD::TexUnified2DU32FloatGrad"; 895 case NVPTXISD::TexUnified2DArrayFloatS32: 896 return "NVPTXISD::TexUnified2DArrayFloatS32"; 897 case NVPTXISD::TexUnified2DArrayFloatFloat: 898 return "NVPTXISD::TexUnified2DArrayFloatFloat"; 899 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 900 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; 901 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 902 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; 903 case NVPTXISD::TexUnified2DArrayS32S32: 904 return "NVPTXISD::TexUnified2DArrayS32S32"; 905 case NVPTXISD::TexUnified2DArrayS32Float: 906 return "NVPTXISD::TexUnified2DArrayS32Float"; 907 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 908 return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; 909 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 910 return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; 911 case NVPTXISD::TexUnified2DArrayU32S32: 912 return "NVPTXISD::TexUnified2DArrayU32S32"; 913 case NVPTXISD::TexUnified2DArrayU32Float: 914 return "NVPTXISD::TexUnified2DArrayU32Float"; 915 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 916 return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; 917 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 918 return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; 919 case NVPTXISD::TexUnified3DFloatS32: 920 return "NVPTXISD::TexUnified3DFloatS32"; 921 case NVPTXISD::TexUnified3DFloatFloat: 922 return "NVPTXISD::TexUnified3DFloatFloat"; 923 case NVPTXISD::TexUnified3DFloatFloatLevel: 924 return "NVPTXISD::TexUnified3DFloatFloatLevel"; 925 case NVPTXISD::TexUnified3DFloatFloatGrad: 926 return "NVPTXISD::TexUnified3DFloatFloatGrad"; 927 case NVPTXISD::TexUnified3DS32S32: 928 return "NVPTXISD::TexUnified3DS32S32"; 929 case NVPTXISD::TexUnified3DS32Float: 930 return "NVPTXISD::TexUnified3DS32Float"; 931 case NVPTXISD::TexUnified3DS32FloatLevel: 932 return "NVPTXISD::TexUnified3DS32FloatLevel"; 933 case NVPTXISD::TexUnified3DS32FloatGrad: 934 return "NVPTXISD::TexUnified3DS32FloatGrad"; 935 case NVPTXISD::TexUnified3DU32S32: 936 return "NVPTXISD::TexUnified3DU32S32"; 
937 case NVPTXISD::TexUnified3DU32Float: 938 return "NVPTXISD::TexUnified3DU32Float"; 939 case NVPTXISD::TexUnified3DU32FloatLevel: 940 return "NVPTXISD::TexUnified3DU32FloatLevel"; 941 case NVPTXISD::TexUnified3DU32FloatGrad: 942 return "NVPTXISD::TexUnified3DU32FloatGrad"; 943 case NVPTXISD::TexUnifiedCubeFloatFloat: 944 return "NVPTXISD::TexUnifiedCubeFloatFloat"; 945 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 946 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; 947 case NVPTXISD::TexUnifiedCubeS32Float: 948 return "NVPTXISD::TexUnifiedCubeS32Float"; 949 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 950 return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; 951 case NVPTXISD::TexUnifiedCubeU32Float: 952 return "NVPTXISD::TexUnifiedCubeU32Float"; 953 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 954 return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; 955 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 956 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; 957 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 958 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; 959 case NVPTXISD::TexUnifiedCubeArrayS32Float: 960 return "NVPTXISD::TexUnifiedCubeArrayS32Float"; 961 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 962 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; 963 case NVPTXISD::TexUnifiedCubeArrayU32Float: 964 return "NVPTXISD::TexUnifiedCubeArrayU32Float"; 965 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 966 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; 967 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 968 return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; 969 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 970 return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; 971 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 972 return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; 973 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 974 return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; 975 case NVPTXISD::Tld4UnifiedR2DS64Float: 976 return "NVPTXISD::Tld4UnifiedR2DS64Float"; 977 case NVPTXISD::Tld4UnifiedG2DS64Float: 978 return "NVPTXISD::Tld4UnifiedG2DS64Float"; 979 case NVPTXISD::Tld4UnifiedB2DS64Float: 980 return "NVPTXISD::Tld4UnifiedB2DS64Float"; 981 case NVPTXISD::Tld4UnifiedA2DS64Float: 982 return "NVPTXISD::Tld4UnifiedA2DS64Float"; 983 case NVPTXISD::Tld4UnifiedR2DU64Float: 984 return "NVPTXISD::Tld4UnifiedR2DU64Float"; 985 case NVPTXISD::Tld4UnifiedG2DU64Float: 986 return "NVPTXISD::Tld4UnifiedG2DU64Float"; 987 case NVPTXISD::Tld4UnifiedB2DU64Float: 988 return "NVPTXISD::Tld4UnifiedB2DU64Float"; 989 case NVPTXISD::Tld4UnifiedA2DU64Float: 990 return "NVPTXISD::Tld4UnifiedA2DU64Float"; 991 992 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; 993 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; 994 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; 995 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; 996 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; 997 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; 998 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; 999 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; 1000 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; 1001 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; 1002 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; 1003 1004 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; 1005 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; 
1006 case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; 1007 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; 1008 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; 1009 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; 1010 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; 1011 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; 1012 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; 1013 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; 1014 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; 1015 1016 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; 1017 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; 1018 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; 1019 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; 1020 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; 1021 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; 1022 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; 1023 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; 1024 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; 1025 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; 1026 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; 1027 1028 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; 1029 case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; 1030 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; 1031 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; 1032 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; 1033 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; 1034 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; 1035 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; 1036 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; 1037 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; 1038 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; 1039 1040 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; 1041 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; 1042 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; 1043 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; 1044 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; 1045 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; 1046 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; 1047 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; 1048 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; 1049 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; 1050 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; 1051 1052 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 1053 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 1054 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 1055 case NVPTXISD::Suld1DI64Trap: return 
"NVPTXISD::Suld1DI64Trap"; 1056 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 1057 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 1058 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 1059 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; 1060 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 1061 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 1062 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 1063 1064 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 1065 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 1066 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 1067 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; 1068 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 1069 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 1070 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 1071 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; 1072 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 1073 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 1074 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 1075 1076 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 1077 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; 1078 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 1079 case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; 1080 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 1081 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; 1082 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 1083 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; 1084 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 1085 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 1086 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 1087 1088 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 1089 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 1090 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 1091 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; 1092 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 1093 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 1094 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 1095 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; 1096 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 1097 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 1098 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 1099 1100 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 1101 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 1102 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 1103 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; 1104 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 1105 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 1106 case 
NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; 1107 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; 1108 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 1109 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 1110 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 1111 1112 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; 1113 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; 1114 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; 1115 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; 1116 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; 1117 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; 1118 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; 1119 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; 1120 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; 1121 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; 1122 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; 1123 1124 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; 1125 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; 1126 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; 1127 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; 1128 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; 1129 case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; 1130 case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; 1131 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; 1132 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; 1133 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; 1134 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; 1135 1136 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; 1137 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; 1138 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; 1139 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; 1140 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; 1141 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; 1142 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; 1143 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; 1144 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; 1145 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; 1146 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; 1147 1148 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; 1149 case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; 1150 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; 1151 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; 1152 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; 1153 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; 1154 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; 1155 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; 1156 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; 
1157 case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; 1158 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; 1159 1160 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; 1161 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; 1162 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; 1163 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; 1164 case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; 1165 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; 1166 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; 1167 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; 1168 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; 1169 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; 1170 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; 1171 } 1172 return nullptr; 1173 } 1174 1175 TargetLoweringBase::LegalizeTypeAction 1176 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1177 if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1) 1178 return TypeSplitVector; 1179 if (VT == MVT::v2f16) 1180 return TypeLegal; 1181 return TargetLoweringBase::getPreferredVectorAction(VT); 1182 } 1183 1184 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1185 int Enabled, int &ExtraSteps, 1186 bool &UseOneConst, 1187 bool Reciprocal) const { 1188 if (!(Enabled == ReciprocalEstimate::Enabled || 1189 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1190 return SDValue(); 1191 1192 if (ExtraSteps == ReciprocalEstimate::Unspecified) 1193 ExtraSteps = 0; 1194 1195 SDLoc DL(Operand); 1196 EVT VT = Operand.getValueType(); 1197 bool Ftz = useF32FTZ(DAG.getMachineFunction()); 1198 1199 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { 1200 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 1201 DAG.getConstant(IID, DL, MVT::i32), Operand); 1202 }; 1203 1204 // The sqrt and rsqrt refinement processes assume we always start out with an 1205 // approximation of the rsqrt. Therefore, if we're going to do any refinement 1206 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing 1207 // any refinement, we must return a regular sqrt. 1208 if (Reciprocal || ExtraSteps > 0) { 1209 if (VT == MVT::f32) 1210 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f 1211 : Intrinsic::nvvm_rsqrt_approx_f); 1212 else if (VT == MVT::f64) 1213 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); 1214 else 1215 return SDValue(); 1216 } else { 1217 if (VT == MVT::f32) 1218 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f 1219 : Intrinsic::nvvm_sqrt_approx_f); 1220 else { 1221 // There's no sqrt.approx.f64 instruction, so we emit 1222 // reciprocal(rsqrt(x)). This is faster than 1223 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain 1224 // x * rsqrt(x).) 
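      // In other words, the node built below computes
      // rcp.approx.ftz.f64(rsqrt.approx.f64(x)), via nvvm_rcp_approx_ftz_d
      // applied to nvvm_rsqrt_approx_d.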
1225 return DAG.getNode( 1226 ISD::INTRINSIC_WO_CHAIN, DL, VT, 1227 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), 1228 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); 1229 } 1230 } 1231 } 1232 1233 SDValue 1234 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 1235 SDLoc dl(Op); 1236 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); 1237 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); 1238 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); 1239 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); 1240 } 1241 1242 std::string NVPTXTargetLowering::getPrototype( 1243 const DataLayout &DL, Type *retTy, const ArgListTy &Args, 1244 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment, 1245 const CallBase &CB) const { 1246 auto PtrVT = getPointerTy(DL); 1247 1248 bool isABI = (STI.getSmVersion() >= 20); 1249 assert(isABI && "Non-ABI compilation is not supported"); 1250 if (!isABI) 1251 return ""; 1252 1253 std::stringstream O; 1254 O << "prototype_" << uniqueCallSite << " : .callprototype "; 1255 1256 if (retTy->getTypeID() == Type::VoidTyID) { 1257 O << "()"; 1258 } else { 1259 O << "("; 1260 if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) { 1261 unsigned size = 0; 1262 if (auto *ITy = dyn_cast<IntegerType>(retTy)) { 1263 size = ITy->getBitWidth(); 1264 } else { 1265 assert(retTy->isFloatingPointTy() && 1266 "Floating point type expected here"); 1267 size = retTy->getPrimitiveSizeInBits(); 1268 } 1269 // PTX ABI requires all scalar return values to be at least 32 1270 // bits in size. fp16 normally uses .b16 as its storage type in 1271 // PTX, so its size must be adjusted here, too. 1272 if (size < 32) 1273 size = 32; 1274 1275 O << ".param .b" << size << " _"; 1276 } else if (isa<PointerType>(retTy)) { 1277 O << ".param .b" << PtrVT.getSizeInBits() << " _"; 1278 } else if (retTy->isAggregateType() || retTy->isVectorTy() || 1279 retTy->isIntegerTy(128)) { 1280 O << ".param .align " << (retAlignment ? 
retAlignment->value() : 0) 1281 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; 1282 } else { 1283 llvm_unreachable("Unknown return type"); 1284 } 1285 O << ") "; 1286 } 1287 O << "_ ("; 1288 1289 bool first = true; 1290 1291 unsigned OIdx = 0; 1292 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1293 Type *Ty = Args[i].Ty; 1294 if (!first) { 1295 O << ", "; 1296 } 1297 first = false; 1298 1299 if (!Outs[OIdx].Flags.isByVal()) { 1300 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { 1301 unsigned align = 0; 1302 const CallInst *CallI = cast<CallInst>(&CB); 1303 // +1 because index 0 is reserved for return type alignment 1304 if (!getAlign(*CallI, i + 1, align)) 1305 align = DL.getABITypeAlignment(Ty); 1306 unsigned sz = DL.getTypeAllocSize(Ty); 1307 O << ".param .align " << align << " .b8 "; 1308 O << "_"; 1309 O << "[" << sz << "]"; 1310 // update the index for Outs 1311 SmallVector<EVT, 16> vtparts; 1312 ComputeValueVTs(*this, DL, Ty, vtparts); 1313 if (unsigned len = vtparts.size()) 1314 OIdx += len - 1; 1315 continue; 1316 } 1317 // i8 types in IR will be i16 types in SDAG 1318 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1319 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1320 "type mismatch between callee prototype and arguments"); 1321 // scalar type 1322 unsigned sz = 0; 1323 if (isa<IntegerType>(Ty)) { 1324 sz = cast<IntegerType>(Ty)->getBitWidth(); 1325 if (sz < 32) 1326 sz = 32; 1327 } else if (isa<PointerType>(Ty)) { 1328 sz = PtrVT.getSizeInBits(); 1329 } else if (Ty->isHalfTy()) 1330 // PTX ABI requires all scalar parameters to be at least 32 1331 // bits in size. fp16 normally uses .b16 as its storage type 1332 // in PTX, so its size must be adjusted here, too. 1333 sz = 32; 1334 else 1335 sz = Ty->getPrimitiveSizeInBits(); 1336 O << ".param .b" << sz << " "; 1337 O << "_"; 1338 continue; 1339 } 1340 auto *PTy = dyn_cast<PointerType>(Ty); 1341 assert(PTy && "Param with byval attribute should be a pointer type"); 1342 Type *ETy = PTy->getElementType(); 1343 1344 Align align = Outs[OIdx].Flags.getNonZeroByValAlign(); 1345 unsigned sz = DL.getTypeAllocSize(ETy); 1346 O << ".param .align " << align.value() << " .b8 "; 1347 O << "_"; 1348 O << "[" << sz << "]"; 1349 } 1350 O << ");"; 1351 return O.str(); 1352 } 1353 1354 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, 1355 const CallBase *CB, Type *Ty, 1356 unsigned Idx, 1357 const DataLayout &DL) const { 1358 if (!CB) { 1359 // CallSite is zero, fallback to ABI type alignment 1360 return DL.getABITypeAlign(Ty); 1361 } 1362 1363 unsigned Alignment = 0; 1364 const Function *DirectCallee = CB->getCalledFunction(); 1365 1366 if (!DirectCallee) { 1367 // We don't have a direct function symbol, but that may be because of 1368 // constant cast instructions in the call. 1369 1370 // With bitcast'd call targets, the instruction will be the call 1371 if (const auto *CI = dyn_cast<CallInst>(CB)) { 1372 // Check if we have call alignment metadata 1373 if (getAlign(*CI, Idx, Alignment)) 1374 return Align(Alignment); 1375 1376 const Value *CalleeV = CI->getCalledOperand(); 1377 // Ignore any bitcast instructions 1378 while (isa<ConstantExpr>(CalleeV)) { 1379 const ConstantExpr *CE = cast<ConstantExpr>(CalleeV); 1380 if (!CE->isCast()) 1381 break; 1382 // Look through the bitcast 1383 CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0); 1384 } 1385 1386 // We have now looked past all of the bitcasts. Do we finally have a 1387 // Function? 
1388 if (const auto *CalleeF = dyn_cast<Function>(CalleeV)) 1389 DirectCallee = CalleeF; 1390 } 1391 } 1392 1393 // Check for function alignment information if we found that the 1394 // ultimate target is a Function 1395 if (DirectCallee) 1396 if (getAlign(*DirectCallee, Idx, Alignment)) 1397 return Align(Alignment); 1398 1399 // Call is indirect or alignment information is not available, fall back to 1400 // the ABI type alignment 1401 return DL.getABITypeAlign(Ty); 1402 } 1403 1404 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1405 SmallVectorImpl<SDValue> &InVals) const { 1406 SelectionDAG &DAG = CLI.DAG; 1407 SDLoc dl = CLI.DL; 1408 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1409 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1410 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1411 SDValue Chain = CLI.Chain; 1412 SDValue Callee = CLI.Callee; 1413 bool &isTailCall = CLI.IsTailCall; 1414 ArgListTy &Args = CLI.getArgs(); 1415 Type *RetTy = CLI.RetTy; 1416 const CallBase *CB = CLI.CB; 1417 const DataLayout &DL = DAG.getDataLayout(); 1418 1419 bool isABI = (STI.getSmVersion() >= 20); 1420 assert(isABI && "Non-ABI compilation is not supported"); 1421 if (!isABI) 1422 return Chain; 1423 1424 SDValue tempChain = Chain; 1425 Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl); 1426 SDValue InFlag = Chain.getValue(1); 1427 1428 unsigned paramCount = 0; 1429 // Args.size() and Outs.size() need not match. 1430 // Outs.size() will be larger 1431 // * if there is an aggregate argument with multiple fields (each field 1432 // showing up separately in Outs) 1433 // * if there is a vector argument with more than typical vector-length 1434 // elements (generally if more than 4) where each vector element is 1435 // individually present in Outs. 1436 // So a different index should be used for indexing into Outs/OutVals. 1437 // See similar issue in LowerFormalArguments. 1438 unsigned OIdx = 0; 1439 // Declare the .params or .reg need to pass values 1440 // to the function 1441 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1442 EVT VT = Outs[OIdx].VT; 1443 Type *Ty = Args[i].Ty; 1444 1445 if (!Outs[OIdx].Flags.isByVal()) { 1446 SmallVector<EVT, 16> VTs; 1447 SmallVector<uint64_t, 16> Offsets; 1448 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets); 1449 Align ArgAlign = getArgumentAlignment(Callee, CB, Ty, paramCount + 1, DL); 1450 unsigned AllocSize = DL.getTypeAllocSize(Ty); 1451 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1452 bool NeedAlign; // Does argument declaration specify alignment? 1453 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { 1454 // declare .param .align <align> .b8 .param<n>[<size>]; 1455 SDValue DeclareParamOps[] = { 1456 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1457 DAG.getConstant(paramCount, dl, MVT::i32), 1458 DAG.getConstant(AllocSize, dl, MVT::i32), InFlag}; 1459 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1460 DeclareParamOps); 1461 NeedAlign = true; 1462 } else { 1463 // declare .param .b<size> .param<n>; 1464 if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) { 1465 // PTX ABI requires integral types to be at least 32 bits in 1466 // size. FP16 is loaded/stored using i16, so it's handled 1467 // here as well. 
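// For example, an i8, i16 or f16 scalar argument is still declared as
// ".param .b32 .param<n>" in the emitted PTX.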
1468 AllocSize = 4; 1469 } 1470 SDValue DeclareScalarParamOps[] = { 1471 Chain, DAG.getConstant(paramCount, dl, MVT::i32), 1472 DAG.getConstant(AllocSize * 8, dl, MVT::i32), 1473 DAG.getConstant(0, dl, MVT::i32), InFlag}; 1474 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1475 DeclareScalarParamOps); 1476 NeedAlign = false; 1477 } 1478 InFlag = Chain.getValue(1); 1479 1480 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1481 // than 32-bits are sign extended or zero extended, depending on 1482 // whether they are signed or unsigned types. This case applies 1483 // only to scalar parameters and not to aggregate values. 1484 bool ExtendIntegerParam = 1485 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1486 1487 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); 1488 SmallVector<SDValue, 6> StoreOperands; 1489 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1490 // New store. 1491 if (VectorInfo[j] & PVF_FIRST) { 1492 assert(StoreOperands.empty() && "Unfinished preceding store."); 1493 StoreOperands.push_back(Chain); 1494 StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32)); 1495 StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32)); 1496 } 1497 1498 EVT EltVT = VTs[j]; 1499 SDValue StVal = OutVals[OIdx]; 1500 if (ExtendIntegerParam) { 1501 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1502 // zext/sext to i32 1503 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1504 : ISD::ZERO_EXTEND, 1505 dl, MVT::i32, StVal); 1506 } else if (EltVT.getSizeInBits() < 16) { 1507 // Use 16-bit registers for small stores as it's the 1508 // smallest general purpose register size supported by NVPTX. 1509 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1510 } 1511 1512 // Record the value to store. 1513 StoreOperands.push_back(StVal); 1514 1515 if (VectorInfo[j] & PVF_LAST) { 1516 unsigned NumElts = StoreOperands.size() - 3; 1517 NVPTXISD::NodeType Op; 1518 switch (NumElts) { 1519 case 1: 1520 Op = NVPTXISD::StoreParam; 1521 break; 1522 case 2: 1523 Op = NVPTXISD::StoreParamV2; 1524 break; 1525 case 4: 1526 Op = NVPTXISD::StoreParamV4; 1527 break; 1528 default: 1529 llvm_unreachable("Invalid vector info."); 1530 } 1531 1532 StoreOperands.push_back(InFlag); 1533 1534 // Adjust type of the store op if we've extended the scalar 1535 // return value. 1536 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j]; 1537 MaybeAlign EltAlign; 1538 if (NeedAlign) 1539 EltAlign = commonAlignment(ArgAlign, Offsets[j]); 1540 1541 Chain = DAG.getMemIntrinsicNode( 1542 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1543 TheStoreType, MachinePointerInfo(), EltAlign, 1544 MachineMemOperand::MOStore); 1545 InFlag = Chain.getValue(1); 1546 1547 // Cleanup. 
1548 StoreOperands.clear();
1549 }
1550 ++OIdx;
1551 }
1552 assert(StoreOperands.empty() && "Unfinished parameter store.");
1553 if (VTs.size() > 0)
1554 --OIdx;
1555 ++paramCount;
1556 continue;
1557 }
1558
1559 // ByVal arguments
1560 SmallVector<EVT, 16> VTs;
1561 SmallVector<uint64_t, 16> Offsets;
1562 auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1563 assert(PTy && "Type of a byval parameter should be pointer");
1564 ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
1565
1566 // declare .param .align <align> .b8 .param<n>[<size>];
1567 unsigned sz = Outs[OIdx].Flags.getByValSize();
1568 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1569 Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1570 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1571 // so we don't need to worry about natural alignment or not.
1572 // See TargetLowering::LowerCallTo().
1573
1574 // Enforce minimum alignment of 4 to work around ptxas miscompile
1575 // for sm_50+. See corresponding alignment adjustment in
1576 // emitFunctionParamList() for details.
1577 if (ArgAlign < Align(4))
1578 ArgAlign = Align(4);
1579 SDValue DeclareParamOps[] = {
1580 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1581 DAG.getConstant(paramCount, dl, MVT::i32),
1582 DAG.getConstant(sz, dl, MVT::i32), InFlag};
1583 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1584 DeclareParamOps);
1585 InFlag = Chain.getValue(1);
1586 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1587 EVT elemtype = VTs[j];
1588 int curOffset = Offsets[j];
1589 unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset);
1590 auto PtrVT = getPointerTy(DL);
1591 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1592 DAG.getConstant(curOffset, dl, PtrVT));
1593 SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1594 MachinePointerInfo(), PartAlign);
1595 if (elemtype.getSizeInBits() < 16) {
1596 theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1597 }
1598 SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1599 SDValue CopyParamOps[] = { Chain,
1600 DAG.getConstant(paramCount, dl, MVT::i32),
1601 DAG.getConstant(curOffset, dl, MVT::i32),
1602 theVal, InFlag };
1603 Chain = DAG.getMemIntrinsicNode(
1604 NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype,
1605 MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore);
1606
1607 InFlag = Chain.getValue(1);
1608 }
1609 ++paramCount;
1610 }
1611
1612 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1613 MaybeAlign retAlignment = None;
1614
1615 // Handle Result
1616 if (Ins.size() > 0) {
1617 SmallVector<EVT, 16> resvtparts;
1618 ComputeValueVTs(*this, DL, RetTy, resvtparts);
1619
1620 // Declare
1621 // .param .align 16 .b8 retval0[<size-in-bytes>], or
1622 // .param .b<size-in-bits> retval0
1623 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1624 // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1625 // these three types to match the logic in
1626 // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1627 // Plus, this behavior is consistent with nvcc's.
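// For example, an i32 or float return is declared as ".param .b32 retval0",
// while an aggregate return such as a 16-byte struct with align 8 is
// declared as ".param .align 8 .b8 retval0[16]".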
1628 if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1629 (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1630 // Scalar needs to be at least 32bit wide
1631 if (resultsz < 32)
1632 resultsz = 32;
1633 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1634 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1635 DAG.getConstant(resultsz, dl, MVT::i32),
1636 DAG.getConstant(0, dl, MVT::i32), InFlag };
1637 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1638 DeclareRetOps);
1639 InFlag = Chain.getValue(1);
1640 } else {
1641 retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
1642 assert(retAlignment && "retAlignment is guaranteed to be set");
1643 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1644 SDValue DeclareRetOps[] = {
1645 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1646 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1647 DAG.getConstant(0, dl, MVT::i32), InFlag};
1648 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1649 DeclareRetOps);
1650 InFlag = Chain.getValue(1);
1651 }
1652 }
1653
1654 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1655 // between them we must rely on the call site value which is valid for
1656 // indirect calls but is always null for libcalls.
1657 bool isIndirectCall = !Func && CB;
1658
1659 if (isa<ExternalSymbolSDNode>(Callee)) {
1660 Function* CalleeFunc = nullptr;
1661
1662 // Try to find the callee in the current module.
1663 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1664 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1665
1666 // Set the "libcall callee" attribute to indicate that the function
1667 // must always have a declaration.
1668 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1669 }
1670
1671 if (isIndirectCall) {
1672 // This is the indirect function call case: PTX requires a prototype of the
1673 // form
1674 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1675 // to be emitted, and the label has to be used as the last arg of the call
1676 // instruction.
1677 // The prototype is embedded in a string and put as the operand for a
1678 // CallPrototype SDNode which will print out to the value of the string.
1679 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1680 std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, *CB);
1681 const char *ProtoStr =
1682 nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1683 SDValue ProtoOps[] = {
1684 Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1685 };
1686 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1687 InFlag = Chain.getValue(1);
1688 }
1689 // Op to just print "call"
1690 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1691 SDValue PrintCallOps[] = {
1692 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1693 };
1694 // We model convergent calls as separate opcodes.
1695 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1696 if (CLI.IsConvergent)
1697 Opcode = Opcode == NVPTXISD::PrintCallUni ?
NVPTXISD::PrintConvergentCallUni 1698 : NVPTXISD::PrintConvergentCall; 1699 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 1700 InFlag = Chain.getValue(1); 1701 1702 // Ops to print out the function name 1703 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1704 SDValue CallVoidOps[] = { Chain, Callee, InFlag }; 1705 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 1706 InFlag = Chain.getValue(1); 1707 1708 // Ops to print out the param list 1709 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1710 SDValue CallArgBeginOps[] = { Chain, InFlag }; 1711 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 1712 CallArgBeginOps); 1713 InFlag = Chain.getValue(1); 1714 1715 for (unsigned i = 0, e = paramCount; i != e; ++i) { 1716 unsigned opcode; 1717 if (i == (e - 1)) 1718 opcode = NVPTXISD::LastCallArg; 1719 else 1720 opcode = NVPTXISD::CallArg; 1721 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1722 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1723 DAG.getConstant(i, dl, MVT::i32), InFlag }; 1724 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 1725 InFlag = Chain.getValue(1); 1726 } 1727 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1728 SDValue CallArgEndOps[] = { Chain, 1729 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32), 1730 InFlag }; 1731 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 1732 InFlag = Chain.getValue(1); 1733 1734 if (isIndirectCall) { 1735 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1736 SDValue PrototypeOps[] = { Chain, 1737 DAG.getConstant(uniqueCallSite, dl, MVT::i32), 1738 InFlag }; 1739 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 1740 InFlag = Chain.getValue(1); 1741 } 1742 1743 SmallVector<SDValue, 16> ProxyRegOps; 1744 SmallVector<Optional<MVT>, 16> ProxyRegTruncates; 1745 1746 // Generate loads from param memory/moves from registers for result 1747 if (Ins.size() > 0) { 1748 SmallVector<EVT, 16> VTs; 1749 SmallVector<uint64_t, 16> Offsets; 1750 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 1751 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 1752 1753 Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 1754 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 1755 1756 SmallVector<EVT, 6> LoadVTs; 1757 int VecIdx = -1; // Index of the first element of the vector. 1758 1759 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 1760 // 32-bits are sign extended or zero extended, depending on whether 1761 // they are signed or unsigned types. 1762 bool ExtendIntegerRetVal = 1763 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 1764 1765 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 1766 bool needTruncate = false; 1767 EVT TheLoadType = VTs[i]; 1768 EVT EltType = Ins[i].VT; 1769 Align EltAlign = commonAlignment(RetAlign, Offsets[i]); 1770 if (ExtendIntegerRetVal) { 1771 TheLoadType = MVT::i32; 1772 EltType = MVT::i32; 1773 needTruncate = true; 1774 } else if (TheLoadType.getSizeInBits() < 16) { 1775 if (VTs[i].isInteger()) 1776 needTruncate = true; 1777 EltType = MVT::i16; 1778 } 1779 1780 // Record index of the very first element of the vector. 
1781 if (VectorInfo[i] & PVF_FIRST) { 1782 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 1783 VecIdx = i; 1784 } 1785 1786 LoadVTs.push_back(EltType); 1787 1788 if (VectorInfo[i] & PVF_LAST) { 1789 unsigned NumElts = LoadVTs.size(); 1790 LoadVTs.push_back(MVT::Other); 1791 LoadVTs.push_back(MVT::Glue); 1792 NVPTXISD::NodeType Op; 1793 switch (NumElts) { 1794 case 1: 1795 Op = NVPTXISD::LoadParam; 1796 break; 1797 case 2: 1798 Op = NVPTXISD::LoadParamV2; 1799 break; 1800 case 4: 1801 Op = NVPTXISD::LoadParamV4; 1802 break; 1803 default: 1804 llvm_unreachable("Invalid vector info."); 1805 } 1806 1807 SDValue LoadOperands[] = { 1808 Chain, DAG.getConstant(1, dl, MVT::i32), 1809 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; 1810 SDValue RetVal = DAG.getMemIntrinsicNode( 1811 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 1812 MachinePointerInfo(), EltAlign, 1813 MachineMemOperand::MOLoad); 1814 1815 for (unsigned j = 0; j < NumElts; ++j) { 1816 ProxyRegOps.push_back(RetVal.getValue(j)); 1817 1818 if (needTruncate) 1819 ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT)); 1820 else 1821 ProxyRegTruncates.push_back(Optional<MVT>()); 1822 } 1823 1824 Chain = RetVal.getValue(NumElts); 1825 InFlag = RetVal.getValue(NumElts + 1); 1826 1827 // Cleanup 1828 VecIdx = -1; 1829 LoadVTs.clear(); 1830 } 1831 } 1832 } 1833 1834 Chain = DAG.getCALLSEQ_END(Chain, 1835 DAG.getIntPtrConstant(uniqueCallSite, dl, true), 1836 DAG.getIntPtrConstant(uniqueCallSite + 1, dl, 1837 true), 1838 InFlag, dl); 1839 InFlag = Chain.getValue(1); 1840 uniqueCallSite++; 1841 1842 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 1843 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 1844 // dangling. 1845 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 1846 SDValue Ret = DAG.getNode( 1847 NVPTXISD::ProxyReg, dl, 1848 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 1849 { Chain, ProxyRegOps[i], InFlag } 1850 ); 1851 1852 Chain = Ret.getValue(1); 1853 InFlag = Ret.getValue(2); 1854 1855 if (ProxyRegTruncates[i].hasValue()) { 1856 Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret); 1857 } 1858 1859 InVals.push_back(Ret); 1860 } 1861 1862 // set isTailCall to false for now, until we figure out how to express 1863 // tail call optimization in PTX 1864 isTailCall = false; 1865 return Chain; 1866 } 1867 1868 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 1869 // (see LegalizeDAG.cpp). This is slow and uses local memory. 1870 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 1871 SDValue 1872 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 1873 SDNode *Node = Op.getNode(); 1874 SDLoc dl(Node); 1875 SmallVector<SDValue, 8> Ops; 1876 unsigned NumOperands = Node->getNumOperands(); 1877 for (unsigned i = 0; i < NumOperands; ++i) { 1878 SDValue SubOp = Node->getOperand(i); 1879 EVT VVT = SubOp.getNode()->getValueType(0); 1880 EVT EltVT = VVT.getVectorElementType(); 1881 unsigned NumSubElem = VVT.getVectorNumElements(); 1882 for (unsigned j = 0; j < NumSubElem; ++j) { 1883 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 1884 DAG.getIntPtrConstant(j, dl))); 1885 } 1886 } 1887 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 1888 } 1889 1890 // We can init constant f16x2 with a single .b32 move. 
Normally it 1891 // would get lowered as two constant loads and vector-packing move. 1892 // mov.b16 %h1, 0x4000; 1893 // mov.b16 %h2, 0x3C00; 1894 // mov.b32 %hh2, {%h2, %h1}; 1895 // Instead we want just a constant move: 1896 // mov.b32 %hh2, 0x40003C00 1897 // 1898 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 1899 // generates good SASS in both cases. 1900 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 1901 SelectionDAG &DAG) const { 1902 //return Op; 1903 if (!(Op->getValueType(0) == MVT::v2f16 && 1904 isa<ConstantFPSDNode>(Op->getOperand(0)) && 1905 isa<ConstantFPSDNode>(Op->getOperand(1)))) 1906 return Op; 1907 1908 APInt E0 = 1909 cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); 1910 APInt E1 = 1911 cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); 1912 SDValue Const = 1913 DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); 1914 return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const); 1915 } 1916 1917 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 1918 SelectionDAG &DAG) const { 1919 SDValue Index = Op->getOperand(1); 1920 // Constant index will be matched by tablegen. 1921 if (isa<ConstantSDNode>(Index.getNode())) 1922 return Op; 1923 1924 // Extract individual elements and select one of them. 1925 SDValue Vector = Op->getOperand(0); 1926 EVT VectorVT = Vector.getValueType(); 1927 assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); 1928 EVT EltVT = VectorVT.getVectorElementType(); 1929 1930 SDLoc dl(Op.getNode()); 1931 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 1932 DAG.getIntPtrConstant(0, dl)); 1933 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 1934 DAG.getIntPtrConstant(1, dl)); 1935 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 1936 ISD::CondCode::SETEQ); 1937 } 1938 1939 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 1940 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 1941 /// amount, or 1942 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 1943 /// amount. 1944 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 1945 SelectionDAG &DAG) const { 1946 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 1947 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 1948 1949 EVT VT = Op.getValueType(); 1950 unsigned VTBits = VT.getSizeInBits(); 1951 SDLoc dl(Op); 1952 SDValue ShOpLo = Op.getOperand(0); 1953 SDValue ShOpHi = Op.getOperand(1); 1954 SDValue ShAmt = Op.getOperand(2); 1955 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 1956 1957 if (VTBits == 32 && STI.getSmVersion() >= 35) { 1958 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
1959 // {dHi, dLo} = {aHi, aLo} >> Amt 1960 // dHi = aHi >> Amt 1961 // dLo = shf.r.clamp aLo, aHi, Amt 1962 1963 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 1964 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 1965 ShAmt); 1966 1967 SDValue Ops[2] = { Lo, Hi }; 1968 return DAG.getMergeValues(Ops, dl); 1969 } 1970 else { 1971 // {dHi, dLo} = {aHi, aLo} >> Amt 1972 // - if (Amt>=size) then 1973 // dLo = aHi >> (Amt-size) 1974 // dHi = aHi >> Amt (this is either all 0 or all 1) 1975 // else 1976 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 1977 // dHi = aHi >> Amt 1978 1979 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 1980 DAG.getConstant(VTBits, dl, MVT::i32), 1981 ShAmt); 1982 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 1983 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 1984 DAG.getConstant(VTBits, dl, MVT::i32)); 1985 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 1986 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 1987 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 1988 1989 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 1990 DAG.getConstant(VTBits, dl, MVT::i32), 1991 ISD::SETGE); 1992 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 1993 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 1994 1995 SDValue Ops[2] = { Lo, Hi }; 1996 return DAG.getMergeValues(Ops, dl); 1997 } 1998 } 1999 2000 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2001 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2002 /// amount, or 2003 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2004 /// amount. 2005 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2006 SelectionDAG &DAG) const { 2007 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2008 assert(Op.getOpcode() == ISD::SHL_PARTS); 2009 2010 EVT VT = Op.getValueType(); 2011 unsigned VTBits = VT.getSizeInBits(); 2012 SDLoc dl(Op); 2013 SDValue ShOpLo = Op.getOperand(0); 2014 SDValue ShOpHi = Op.getOperand(1); 2015 SDValue ShAmt = Op.getOperand(2); 2016 2017 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2018 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2019 // {dHi, dLo} = {aHi, aLo} << Amt
2020 // dHi = shf.l.clamp aLo, aHi, Amt
2021 // dLo = aLo << Amt
2022
2023 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2024 ShAmt);
2025 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2026
2027 SDValue Ops[2] = { Lo, Hi };
2028 return DAG.getMergeValues(Ops, dl);
2029 }
2030 else {
2031 // {dHi, dLo} = {aHi, aLo} << Amt
2032 // - if (Amt>=size) then
2033 // dLo = aLo << Amt (all 0)
2034 // dHi = aLo << (Amt-size)
2035 // else
2036 // dLo = aLo << Amt
2037 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2038
2039 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2040 DAG.getConstant(VTBits, dl, MVT::i32),
2041 ShAmt);
2042 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2043 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2044 DAG.getConstant(VTBits, dl, MVT::i32));
2045 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2046 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2047 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2048
2049 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2050 DAG.getConstant(VTBits, dl, MVT::i32),
2051 ISD::SETGE);
2052 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2053 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2054
2055 SDValue Ops[2] = { Lo, Hi };
2056 return DAG.getMergeValues(Ops, dl);
2057 }
2058 }
2059
2060 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2061 EVT VT = Op.getValueType();
2062
2063 if (VT == MVT::f32)
2064 return LowerFROUND32(Op, DAG);
2065
2066 if (VT == MVT::f64)
2067 return LowerFROUND64(Op, DAG);
2068
2069 llvm_unreachable("unhandled type");
2070 }
2071
2072 // This is the rounding method used in CUDA libdevice, in C-like code:
2073 // float roundf(float A)
2074 // {
2075 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2076 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2077 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2078 // }
2079 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2080 SelectionDAG &DAG) const {
2081 SDLoc SL(Op);
2082 SDValue A = Op.getOperand(0);
2083 EVT VT = Op.getValueType();
2084
2085 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2086
2087 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2088 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2089 const int SignBitMask = 0x80000000;
2090 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2091 DAG.getConstant(SignBitMask, SL, MVT::i32));
2092 const int PointFiveInBits = 0x3F000000;
2093 SDValue PointFiveWithSignRaw =
2094 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2095 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2096 SDValue PointFiveWithSign =
2097 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2098 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2099 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2100
2101 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2102 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2103 SDValue IsLarge =
2104 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2105 ISD::SETOGT);
2106 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2107
2108 // return abs(A) < 0.5 ?
(float)(int)A : RoundedA; 2109 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2110 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2111 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2112 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2113 } 2114 2115 // The implementation of round(double) is similar to that of round(float) in 2116 // that they both separate the value range into three regions and use a method 2117 // specific to the region to round the values. However, round(double) first 2118 // calculates the round of the absolute value and then adds the sign back while 2119 // round(float) directly rounds the value with sign. 2120 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2121 SelectionDAG &DAG) const { 2122 SDLoc SL(Op); 2123 SDValue A = Op.getOperand(0); 2124 EVT VT = Op.getValueType(); 2125 2126 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2127 2128 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2129 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2130 DAG.getConstantFP(0.5, SL, VT)); 2131 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2132 2133 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2134 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2135 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2136 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2137 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2138 DAG.getConstantFP(0, SL, VT), 2139 RoundedA); 2140 2141 // Add sign to rounded_A 2142 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2143 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2144 2145 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2146 SDValue IsLarge = 2147 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2148 ISD::SETOGT); 2149 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2150 } 2151 2152 2153 2154 SDValue 2155 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2156 switch (Op.getOpcode()) { 2157 case ISD::RETURNADDR: 2158 return SDValue(); 2159 case ISD::FRAMEADDR: 2160 return SDValue(); 2161 case ISD::GlobalAddress: 2162 return LowerGlobalAddress(Op, DAG); 2163 case ISD::INTRINSIC_W_CHAIN: 2164 return Op; 2165 case ISD::BUILD_VECTOR: 2166 return LowerBUILD_VECTOR(Op, DAG); 2167 case ISD::EXTRACT_SUBVECTOR: 2168 return Op; 2169 case ISD::EXTRACT_VECTOR_ELT: 2170 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2171 case ISD::CONCAT_VECTORS: 2172 return LowerCONCAT_VECTORS(Op, DAG); 2173 case ISD::STORE: 2174 return LowerSTORE(Op, DAG); 2175 case ISD::LOAD: 2176 return LowerLOAD(Op, DAG); 2177 case ISD::SHL_PARTS: 2178 return LowerShiftLeftParts(Op, DAG); 2179 case ISD::SRA_PARTS: 2180 case ISD::SRL_PARTS: 2181 return LowerShiftRightParts(Op, DAG); 2182 case ISD::SELECT: 2183 return LowerSelect(Op, DAG); 2184 case ISD::FROUND: 2185 return LowerFROUND(Op, DAG); 2186 default: 2187 llvm_unreachable("Custom lowering not defined for operation"); 2188 } 2189 } 2190 2191 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2192 SDValue Op0 = Op->getOperand(0); 2193 SDValue Op1 = Op->getOperand(1); 2194 SDValue Op2 = Op->getOperand(2); 2195 SDLoc DL(Op.getNode()); 2196 2197 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2198 2199 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2200 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2201 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 
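// The operands were widened to i32 above, so truncate the selected value
// back down to i1.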
2202 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2203 2204 return Trunc; 2205 } 2206 2207 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2208 if (Op.getValueType() == MVT::i1) 2209 return LowerLOADi1(Op, DAG); 2210 2211 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2212 // loads and have to handle it here. 2213 if (Op.getValueType() == MVT::v2f16) { 2214 LoadSDNode *Load = cast<LoadSDNode>(Op); 2215 EVT MemVT = Load->getMemoryVT(); 2216 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2217 MemVT, *Load->getMemOperand())) { 2218 SDValue Ops[2]; 2219 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2220 return DAG.getMergeValues(Ops, SDLoc(Op)); 2221 } 2222 } 2223 2224 return SDValue(); 2225 } 2226 2227 // v = ld i1* addr 2228 // => 2229 // v1 = ld i8* addr (-> i16) 2230 // v = trunc i16 to i1 2231 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2232 SDNode *Node = Op.getNode(); 2233 LoadSDNode *LD = cast<LoadSDNode>(Node); 2234 SDLoc dl(Node); 2235 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2236 assert(Node->getValueType(0) == MVT::i1 && 2237 "Custom lowering for i1 load only"); 2238 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), 2239 LD->getPointerInfo(), LD->getAlignment(), 2240 LD->getMemOperand()->getFlags()); 2241 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2242 // The legalizer (the caller) is expecting two values from the legalized 2243 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2244 // in LegalizeDAG.cpp which also uses MergeValues. 2245 SDValue Ops[] = { result, LD->getChain() }; 2246 return DAG.getMergeValues(Ops, dl); 2247 } 2248 2249 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2250 StoreSDNode *Store = cast<StoreSDNode>(Op); 2251 EVT VT = Store->getMemoryVT(); 2252 2253 if (VT == MVT::i1) 2254 return LowerSTOREi1(Op, DAG); 2255 2256 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2257 // stores and have to handle it here. 2258 if (VT == MVT::v2f16 && 2259 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2260 VT, *Store->getMemOperand())) 2261 return expandUnalignedStore(Store, DAG); 2262 2263 if (VT.isVector()) 2264 return LowerSTOREVector(Op, DAG); 2265 2266 return SDValue(); 2267 } 2268 2269 SDValue 2270 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2271 SDNode *N = Op.getNode(); 2272 SDValue Val = N->getOperand(1); 2273 SDLoc DL(N); 2274 EVT ValVT = Val.getValueType(); 2275 2276 if (ValVT.isVector()) { 2277 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2278 // legal. We can (and should) split that into 2 stores of <2 x double> here 2279 // but I'm leaving that as a TODO for now. 
2280 if (!ValVT.isSimple()) 2281 return SDValue(); 2282 switch (ValVT.getSimpleVT().SimpleTy) { 2283 default: 2284 return SDValue(); 2285 case MVT::v2i8: 2286 case MVT::v2i16: 2287 case MVT::v2i32: 2288 case MVT::v2i64: 2289 case MVT::v2f16: 2290 case MVT::v2f32: 2291 case MVT::v2f64: 2292 case MVT::v4i8: 2293 case MVT::v4i16: 2294 case MVT::v4i32: 2295 case MVT::v4f16: 2296 case MVT::v4f32: 2297 case MVT::v8f16: // <4 x f16x2> 2298 // This is a "native" vector type 2299 break; 2300 } 2301 2302 MemSDNode *MemSD = cast<MemSDNode>(N); 2303 const DataLayout &TD = DAG.getDataLayout(); 2304 2305 Align Alignment = MemSD->getAlign(); 2306 Align PrefAlign = 2307 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); 2308 if (Alignment < PrefAlign) { 2309 // This store is not sufficiently aligned, so bail out and let this vector 2310 // store be scalarized. Note that we may still be able to emit smaller 2311 // vector stores. For example, if we are storing a <4 x float> with an 2312 // alignment of 8, this check will fail but the legalizer will try again 2313 // with 2 x <2 x float>, which will succeed with an alignment of 8. 2314 return SDValue(); 2315 } 2316 2317 unsigned Opcode = 0; 2318 EVT EltVT = ValVT.getVectorElementType(); 2319 unsigned NumElts = ValVT.getVectorNumElements(); 2320 2321 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2322 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 2323 // stored type to i16 and propagate the "real" type as the memory type. 2324 bool NeedExt = false; 2325 if (EltVT.getSizeInBits() < 16) 2326 NeedExt = true; 2327 2328 bool StoreF16x2 = false; 2329 switch (NumElts) { 2330 default: 2331 return SDValue(); 2332 case 2: 2333 Opcode = NVPTXISD::StoreV2; 2334 break; 2335 case 4: 2336 Opcode = NVPTXISD::StoreV4; 2337 break; 2338 case 8: 2339 // v8f16 is a special case. PTX doesn't have st.v8.f16 2340 // instruction. Instead, we split the vector into v2f16 chunks and 2341 // store them with st.v4.b32. 
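// For example, a <8 x half> store comes out roughly as:
//   st.v4.b32 [addr], {h0h1, h2h3, h4h5, h6h7};
// where each operand holds a packed f16x2 pair.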
2342 assert(EltVT == MVT::f16 && "Wrong type for the vector."); 2343 Opcode = NVPTXISD::StoreV4; 2344 StoreF16x2 = true; 2345 break; 2346 } 2347 2348 SmallVector<SDValue, 8> Ops; 2349 2350 // First is the chain 2351 Ops.push_back(N->getOperand(0)); 2352 2353 if (StoreF16x2) { 2354 // Combine f16,f16 -> v2f16 2355 NumElts /= 2; 2356 for (unsigned i = 0; i < NumElts; ++i) { 2357 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2358 DAG.getIntPtrConstant(i * 2, DL)); 2359 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2360 DAG.getIntPtrConstant(i * 2 + 1, DL)); 2361 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); 2362 Ops.push_back(V2); 2363 } 2364 } else { 2365 // Then the split values 2366 for (unsigned i = 0; i < NumElts; ++i) { 2367 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2368 DAG.getIntPtrConstant(i, DL)); 2369 if (NeedExt) 2370 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 2371 Ops.push_back(ExtVal); 2372 } 2373 } 2374 2375 // Then any remaining arguments 2376 Ops.append(N->op_begin() + 2, N->op_end()); 2377 2378 SDValue NewSt = 2379 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 2380 MemSD->getMemoryVT(), MemSD->getMemOperand()); 2381 2382 // return DCI.CombineTo(N, NewSt, true); 2383 return NewSt; 2384 } 2385 2386 return SDValue(); 2387 } 2388 2389 // st i1 v, addr 2390 // => 2391 // v1 = zxt v to i16 2392 // st.u8 i16, addr 2393 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 2394 SDNode *Node = Op.getNode(); 2395 SDLoc dl(Node); 2396 StoreSDNode *ST = cast<StoreSDNode>(Node); 2397 SDValue Tmp1 = ST->getChain(); 2398 SDValue Tmp2 = ST->getBasePtr(); 2399 SDValue Tmp3 = ST->getValue(); 2400 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 2401 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 2402 SDValue Result = 2403 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 2404 ST->getAlignment(), ST->getMemOperand()->getFlags()); 2405 return Result; 2406 } 2407 2408 SDValue 2409 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { 2410 std::string ParamSym; 2411 raw_string_ostream ParamStr(ParamSym); 2412 2413 ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx; 2414 ParamStr.flush(); 2415 2416 std::string *SavedStr = 2417 nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str()); 2418 return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); 2419 } 2420 2421 // Check to see if the kernel argument is image*_t or sampler_t 2422 2423 static bool isImageOrSamplerVal(const Value *arg, const Module *context) { 2424 static const char *const specialTypes[] = { "struct._image2d_t", 2425 "struct._image3d_t", 2426 "struct._sampler_t" }; 2427 2428 Type *Ty = arg->getType(); 2429 auto *PTy = dyn_cast<PointerType>(Ty); 2430 2431 if (!PTy) 2432 return false; 2433 2434 if (!context) 2435 return false; 2436 2437 auto *STy = dyn_cast<StructType>(PTy->getElementType()); 2438 if (!STy || STy->isLiteral()) 2439 return false; 2440 2441 return std::find(std::begin(specialTypes), std::end(specialTypes), 2442 STy->getName()) != std::end(specialTypes); 2443 } 2444 2445 SDValue NVPTXTargetLowering::LowerFormalArguments( 2446 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2447 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2448 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2449 MachineFunction &MF = 
DAG.getMachineFunction();
2450 const DataLayout &DL = DAG.getDataLayout();
2451 auto PtrVT = getPointerTy(DAG.getDataLayout());
2452
2453 const Function *F = &MF.getFunction();
2454 const AttributeList &PAL = F->getAttributes();
2455 const TargetLowering *TLI = STI.getTargetLowering();
2456
2457 SDValue Root = DAG.getRoot();
2458 std::vector<SDValue> OutChains;
2459
2460 bool isABI = (STI.getSmVersion() >= 20);
2461 assert(isABI && "Non-ABI compilation is not supported");
2462 if (!isABI)
2463 return Chain;
2464
2465 std::vector<Type *> argTypes;
2466 std::vector<const Argument *> theArgs;
2467 for (const Argument &I : F->args()) {
2468 theArgs.push_back(&I);
2469 argTypes.push_back(I.getType());
2470 }
2471 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2472 // Ins.size() will be larger
2473 // * if there is an aggregate argument with multiple fields (each field
2474 // showing up separately in Ins)
2475 // * if there is a vector argument with more than typical vector-length
2476 // elements (generally if more than 4) where each vector element is
2477 // individually present in Ins.
2478 // So a different index should be used for indexing into Ins.
2479 // See similar issue in LowerCall.
2480 unsigned InsIdx = 0;
2481
2482 int idx = 0;
2483 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2484 Type *Ty = argTypes[i];
2485
2486 // If the kernel argument is image*_t or sampler_t, convert it to
2487 // an i32 constant holding the parameter position. This can later be
2488 // matched in the AsmPrinter to output the correct mangled name.
2489 if (isImageOrSamplerVal(
2490 theArgs[i],
2491 (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2492 : nullptr))) {
2493 assert(isKernelFunction(*F) &&
2494 "Only kernels can have image/sampler params");
2495 InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2496 continue;
2497 }
2498
2499 if (theArgs[i]->use_empty()) {
2500 // argument is dead
2501 if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2502 SmallVector<EVT, 16> vtparts;
2503
2504 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2505 assert(vtparts.size() > 0 && "empty aggregate type not expected");
2506 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2507 ++parti) {
2508 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2509 ++InsIdx;
2510 }
2511 if (vtparts.size() > 0)
2512 --InsIdx;
2513 continue;
2514 }
2515 if (Ty->isVectorTy()) {
2516 EVT ObjectVT = getValueType(DL, Ty);
2517 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2518 for (unsigned parti = 0; parti < NumRegs; ++parti) {
2519 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2520 ++InsIdx;
2521 }
2522 if (NumRegs > 0)
2523 --InsIdx;
2524 continue;
2525 }
2526 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2527 continue;
2528 }
2529
2530 // In the following cases, assign a node order of "idx+1"
2531 // to newly created nodes. The SDNodes for params have to
2532 // appear in the same order as their order of appearance
2533 // in the original function. "idx+1" holds that order.
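// Non-byval parameters are loaded directly out of the .param address space
// below; byval parameters are instead handled further down via
// NVPTXISD::MoveParam.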
2534 if (!PAL.hasParamAttribute(i, Attribute::ByVal)) { 2535 bool aggregateIsPacked = false; 2536 if (StructType *STy = dyn_cast<StructType>(Ty)) 2537 aggregateIsPacked = STy->isPacked(); 2538 2539 SmallVector<EVT, 16> VTs; 2540 SmallVector<uint64_t, 16> Offsets; 2541 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 2542 assert(VTs.size() > 0 && "Unexpected empty type."); 2543 auto VectorInfo = 2544 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); 2545 2546 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2547 int VecIdx = -1; // Index of the first element of the current vector. 2548 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 2549 if (VectorInfo[parti] & PVF_FIRST) { 2550 assert(VecIdx == -1 && "Orphaned vector."); 2551 VecIdx = parti; 2552 } 2553 2554 // That's the last element of this store op. 2555 if (VectorInfo[parti] & PVF_LAST) { 2556 unsigned NumElts = parti - VecIdx + 1; 2557 EVT EltVT = VTs[parti]; 2558 // i1 is loaded/stored as i8. 2559 EVT LoadVT = EltVT; 2560 if (EltVT == MVT::i1) 2561 LoadVT = MVT::i8; 2562 else if (EltVT == MVT::v2f16) 2563 // getLoad needs a vector type, but it can't handle 2564 // vectors which contain v2f16 elements. So we must load 2565 // using i32 here and then bitcast back. 2566 LoadVT = MVT::i32; 2567 2568 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 2569 SDValue VecAddr = 2570 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 2571 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 2572 Value *srcValue = Constant::getNullValue(PointerType::get( 2573 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 2574 SDValue P = 2575 DAG.getLoad(VecVT, dl, Root, VecAddr, 2576 MachinePointerInfo(srcValue), aggregateIsPacked, 2577 MachineMemOperand::MODereferenceable | 2578 MachineMemOperand::MOInvariant); 2579 if (P.getNode()) 2580 P.getNode()->setIROrder(idx + 1); 2581 for (unsigned j = 0; j < NumElts; ++j) { 2582 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 2583 DAG.getIntPtrConstant(j, dl)); 2584 // We've loaded i1 as an i8 and now must truncate it back to i1 2585 if (EltVT == MVT::i1) 2586 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 2587 // v2f16 was loaded as an i32. Now we must bitcast it back. 2588 else if (EltVT == MVT::v2f16) 2589 Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); 2590 // Extend the element if necessary (e.g. an i8 is loaded 2591 // into an i16 register) 2592 if (Ins[InsIdx].VT.isInteger() && 2593 Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) { 2594 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 2595 : ISD::ZERO_EXTEND; 2596 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 2597 } 2598 InVals.push_back(Elt); 2599 } 2600 2601 // Reset vector tracking state. 2602 VecIdx = -1; 2603 } 2604 ++InsIdx; 2605 } 2606 if (VTs.size() > 0) 2607 --InsIdx; 2608 continue; 2609 } 2610 2611 // Param has ByVal attribute 2612 // Return MoveParam(param symbol). 2613 // Ideally, the param symbol can be returned directly, 2614 // but when SDNode builder decides to use it in a CopyToReg(), 2615 // machine instruction fails because TargetExternalSymbol 2616 // (not lowered) is target dependent, and CopyToReg assumes 2617 // the source is lowered. 
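// The symbol produced by getParamSymbol() below has the form
// "<function-name>_param_<idx>", e.g. "foo_param_2" for the third
// parameter of a function named foo.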
2618 EVT ObjectVT = getValueType(DL, Ty); 2619 assert(ObjectVT == Ins[InsIdx].VT && 2620 "Ins type did not match function type"); 2621 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2622 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 2623 if (p.getNode()) 2624 p.getNode()->setIROrder(idx + 1); 2625 InVals.push_back(p); 2626 } 2627 2628 // Clang will check explicit VarArg and issue error if any. However, Clang 2629 // will let code with 2630 // implicit var arg like f() pass. See bug 617733. 2631 // We treat this case as if the arg list is empty. 2632 // if (F.isVarArg()) { 2633 // assert(0 && "VarArg not supported yet!"); 2634 //} 2635 2636 if (!OutChains.empty()) 2637 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 2638 2639 return Chain; 2640 } 2641 2642 SDValue 2643 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2644 bool isVarArg, 2645 const SmallVectorImpl<ISD::OutputArg> &Outs, 2646 const SmallVectorImpl<SDValue> &OutVals, 2647 const SDLoc &dl, SelectionDAG &DAG) const { 2648 MachineFunction &MF = DAG.getMachineFunction(); 2649 Type *RetTy = MF.getFunction().getReturnType(); 2650 2651 bool isABI = (STI.getSmVersion() >= 20); 2652 assert(isABI && "Non-ABI compilation is not supported"); 2653 if (!isABI) 2654 return Chain; 2655 2656 const DataLayout DL = DAG.getDataLayout(); 2657 SmallVector<EVT, 16> VTs; 2658 SmallVector<uint64_t, 16> Offsets; 2659 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 2660 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 2661 2662 auto VectorInfo = VectorizePTXValueVTs( 2663 VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1)); 2664 2665 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2666 // 32-bits are sign extended or zero extended, depending on whether 2667 // they are signed or unsigned types. 2668 bool ExtendIntegerRetVal = 2669 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2670 2671 SmallVector<SDValue, 6> StoreOperands; 2672 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2673 // New load/store. Record chain and offset operands. 2674 if (VectorInfo[i] & PVF_FIRST) { 2675 assert(StoreOperands.empty() && "Orphaned operand list."); 2676 StoreOperands.push_back(Chain); 2677 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 2678 } 2679 2680 SDValue RetVal = OutVals[i]; 2681 if (ExtendIntegerRetVal) { 2682 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 2683 : ISD::ZERO_EXTEND, 2684 dl, MVT::i32, RetVal); 2685 } else if (RetVal.getValueSizeInBits() < 16) { 2686 // Use 16-bit registers for small load-stores as it's the 2687 // smallest general purpose register size supported by NVPTX. 2688 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 2689 } 2690 2691 // Record the value to return. 2692 StoreOperands.push_back(RetVal); 2693 2694 // That's the last element of this store op. 2695 if (VectorInfo[i] & PVF_LAST) { 2696 NVPTXISD::NodeType Op; 2697 unsigned NumElts = StoreOperands.size() - 2; 2698 switch (NumElts) { 2699 case 1: 2700 Op = NVPTXISD::StoreRetval; 2701 break; 2702 case 2: 2703 Op = NVPTXISD::StoreRetvalV2; 2704 break; 2705 case 4: 2706 Op = NVPTXISD::StoreRetvalV4; 2707 break; 2708 default: 2709 llvm_unreachable("Invalid vector info."); 2710 } 2711 2712 // Adjust type of load/store op if we've extended the scalar 2713 // return value. 2714 EVT TheStoreType = ExtendIntegerRetVal ? 
MVT::i32 : VTs[i]; 2715 Chain = DAG.getMemIntrinsicNode( 2716 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, 2717 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 2718 // Cleanup vector state. 2719 StoreOperands.clear(); 2720 } 2721 } 2722 2723 return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); 2724 } 2725 2726 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 2727 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 2728 SelectionDAG &DAG) const { 2729 if (Constraint.length() > 1) 2730 return; 2731 else 2732 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 2733 } 2734 2735 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 2736 switch (Intrinsic) { 2737 default: 2738 return 0; 2739 2740 case Intrinsic::nvvm_tex_1d_v4f32_s32: 2741 return NVPTXISD::Tex1DFloatS32; 2742 case Intrinsic::nvvm_tex_1d_v4f32_f32: 2743 return NVPTXISD::Tex1DFloatFloat; 2744 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 2745 return NVPTXISD::Tex1DFloatFloatLevel; 2746 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 2747 return NVPTXISD::Tex1DFloatFloatGrad; 2748 case Intrinsic::nvvm_tex_1d_v4s32_s32: 2749 return NVPTXISD::Tex1DS32S32; 2750 case Intrinsic::nvvm_tex_1d_v4s32_f32: 2751 return NVPTXISD::Tex1DS32Float; 2752 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 2753 return NVPTXISD::Tex1DS32FloatLevel; 2754 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 2755 return NVPTXISD::Tex1DS32FloatGrad; 2756 case Intrinsic::nvvm_tex_1d_v4u32_s32: 2757 return NVPTXISD::Tex1DU32S32; 2758 case Intrinsic::nvvm_tex_1d_v4u32_f32: 2759 return NVPTXISD::Tex1DU32Float; 2760 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 2761 return NVPTXISD::Tex1DU32FloatLevel; 2762 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 2763 return NVPTXISD::Tex1DU32FloatGrad; 2764 2765 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 2766 return NVPTXISD::Tex1DArrayFloatS32; 2767 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 2768 return NVPTXISD::Tex1DArrayFloatFloat; 2769 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 2770 return NVPTXISD::Tex1DArrayFloatFloatLevel; 2771 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 2772 return NVPTXISD::Tex1DArrayFloatFloatGrad; 2773 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 2774 return NVPTXISD::Tex1DArrayS32S32; 2775 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 2776 return NVPTXISD::Tex1DArrayS32Float; 2777 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 2778 return NVPTXISD::Tex1DArrayS32FloatLevel; 2779 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 2780 return NVPTXISD::Tex1DArrayS32FloatGrad; 2781 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 2782 return NVPTXISD::Tex1DArrayU32S32; 2783 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 2784 return NVPTXISD::Tex1DArrayU32Float; 2785 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 2786 return NVPTXISD::Tex1DArrayU32FloatLevel; 2787 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 2788 return NVPTXISD::Tex1DArrayU32FloatGrad; 2789 2790 case Intrinsic::nvvm_tex_2d_v4f32_s32: 2791 return NVPTXISD::Tex2DFloatS32; 2792 case Intrinsic::nvvm_tex_2d_v4f32_f32: 2793 return NVPTXISD::Tex2DFloatFloat; 2794 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 2795 return NVPTXISD::Tex2DFloatFloatLevel; 2796 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 2797 return NVPTXISD::Tex2DFloatFloatGrad; 2798 case Intrinsic::nvvm_tex_2d_v4s32_s32: 2799 return NVPTXISD::Tex2DS32S32; 2800 case Intrinsic::nvvm_tex_2d_v4s32_f32: 2801 return NVPTXISD::Tex2DS32Float; 2802 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 
2803 return NVPTXISD::Tex2DS32FloatLevel; 2804 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 2805 return NVPTXISD::Tex2DS32FloatGrad; 2806 case Intrinsic::nvvm_tex_2d_v4u32_s32: 2807 return NVPTXISD::Tex2DU32S32; 2808 case Intrinsic::nvvm_tex_2d_v4u32_f32: 2809 return NVPTXISD::Tex2DU32Float; 2810 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 2811 return NVPTXISD::Tex2DU32FloatLevel; 2812 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 2813 return NVPTXISD::Tex2DU32FloatGrad; 2814 2815 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 2816 return NVPTXISD::Tex2DArrayFloatS32; 2817 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 2818 return NVPTXISD::Tex2DArrayFloatFloat; 2819 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 2820 return NVPTXISD::Tex2DArrayFloatFloatLevel; 2821 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 2822 return NVPTXISD::Tex2DArrayFloatFloatGrad; 2823 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 2824 return NVPTXISD::Tex2DArrayS32S32; 2825 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 2826 return NVPTXISD::Tex2DArrayS32Float; 2827 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 2828 return NVPTXISD::Tex2DArrayS32FloatLevel; 2829 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 2830 return NVPTXISD::Tex2DArrayS32FloatGrad; 2831 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 2832 return NVPTXISD::Tex2DArrayU32S32; 2833 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 2834 return NVPTXISD::Tex2DArrayU32Float; 2835 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 2836 return NVPTXISD::Tex2DArrayU32FloatLevel; 2837 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 2838 return NVPTXISD::Tex2DArrayU32FloatGrad; 2839 2840 case Intrinsic::nvvm_tex_3d_v4f32_s32: 2841 return NVPTXISD::Tex3DFloatS32; 2842 case Intrinsic::nvvm_tex_3d_v4f32_f32: 2843 return NVPTXISD::Tex3DFloatFloat; 2844 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 2845 return NVPTXISD::Tex3DFloatFloatLevel; 2846 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 2847 return NVPTXISD::Tex3DFloatFloatGrad; 2848 case Intrinsic::nvvm_tex_3d_v4s32_s32: 2849 return NVPTXISD::Tex3DS32S32; 2850 case Intrinsic::nvvm_tex_3d_v4s32_f32: 2851 return NVPTXISD::Tex3DS32Float; 2852 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 2853 return NVPTXISD::Tex3DS32FloatLevel; 2854 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 2855 return NVPTXISD::Tex3DS32FloatGrad; 2856 case Intrinsic::nvvm_tex_3d_v4u32_s32: 2857 return NVPTXISD::Tex3DU32S32; 2858 case Intrinsic::nvvm_tex_3d_v4u32_f32: 2859 return NVPTXISD::Tex3DU32Float; 2860 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 2861 return NVPTXISD::Tex3DU32FloatLevel; 2862 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 2863 return NVPTXISD::Tex3DU32FloatGrad; 2864 2865 case Intrinsic::nvvm_tex_cube_v4f32_f32: 2866 return NVPTXISD::TexCubeFloatFloat; 2867 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 2868 return NVPTXISD::TexCubeFloatFloatLevel; 2869 case Intrinsic::nvvm_tex_cube_v4s32_f32: 2870 return NVPTXISD::TexCubeS32Float; 2871 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 2872 return NVPTXISD::TexCubeS32FloatLevel; 2873 case Intrinsic::nvvm_tex_cube_v4u32_f32: 2874 return NVPTXISD::TexCubeU32Float; 2875 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 2876 return NVPTXISD::TexCubeU32FloatLevel; 2877 2878 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 2879 return NVPTXISD::TexCubeArrayFloatFloat; 2880 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 2881 return NVPTXISD::TexCubeArrayFloatFloatLevel; 2882 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 2883 return NVPTXISD::TexCubeArrayS32Float; 2884 case 
Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 2885 return NVPTXISD::TexCubeArrayS32FloatLevel; 2886 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 2887 return NVPTXISD::TexCubeArrayU32Float; 2888 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 2889 return NVPTXISD::TexCubeArrayU32FloatLevel; 2890 2891 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 2892 return NVPTXISD::Tld4R2DFloatFloat; 2893 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 2894 return NVPTXISD::Tld4G2DFloatFloat; 2895 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 2896 return NVPTXISD::Tld4B2DFloatFloat; 2897 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 2898 return NVPTXISD::Tld4A2DFloatFloat; 2899 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 2900 return NVPTXISD::Tld4R2DS64Float; 2901 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 2902 return NVPTXISD::Tld4G2DS64Float; 2903 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 2904 return NVPTXISD::Tld4B2DS64Float; 2905 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 2906 return NVPTXISD::Tld4A2DS64Float; 2907 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 2908 return NVPTXISD::Tld4R2DU64Float; 2909 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 2910 return NVPTXISD::Tld4G2DU64Float; 2911 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 2912 return NVPTXISD::Tld4B2DU64Float; 2913 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 2914 return NVPTXISD::Tld4A2DU64Float; 2915 2916 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 2917 return NVPTXISD::TexUnified1DFloatS32; 2918 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 2919 return NVPTXISD::TexUnified1DFloatFloat; 2920 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 2921 return NVPTXISD::TexUnified1DFloatFloatLevel; 2922 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 2923 return NVPTXISD::TexUnified1DFloatFloatGrad; 2924 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 2925 return NVPTXISD::TexUnified1DS32S32; 2926 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 2927 return NVPTXISD::TexUnified1DS32Float; 2928 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 2929 return NVPTXISD::TexUnified1DS32FloatLevel; 2930 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 2931 return NVPTXISD::TexUnified1DS32FloatGrad; 2932 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 2933 return NVPTXISD::TexUnified1DU32S32; 2934 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 2935 return NVPTXISD::TexUnified1DU32Float; 2936 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 2937 return NVPTXISD::TexUnified1DU32FloatLevel; 2938 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 2939 return NVPTXISD::TexUnified1DU32FloatGrad; 2940 2941 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 2942 return NVPTXISD::TexUnified1DArrayFloatS32; 2943 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 2944 return NVPTXISD::TexUnified1DArrayFloatFloat; 2945 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 2946 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 2947 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 2948 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 2949 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 2950 return NVPTXISD::TexUnified1DArrayS32S32; 2951 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 2952 return NVPTXISD::TexUnified1DArrayS32Float; 2953 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 2954 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 2955 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 2956 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 2957 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 2958 return 
NVPTXISD::TexUnified1DArrayU32S32; 2959 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 2960 return NVPTXISD::TexUnified1DArrayU32Float; 2961 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 2962 return NVPTXISD::TexUnified1DArrayU32FloatLevel; 2963 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 2964 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 2965 2966 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 2967 return NVPTXISD::TexUnified2DFloatS32; 2968 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 2969 return NVPTXISD::TexUnified2DFloatFloat; 2970 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 2971 return NVPTXISD::TexUnified2DFloatFloatLevel; 2972 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 2973 return NVPTXISD::TexUnified2DFloatFloatGrad; 2974 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 2975 return NVPTXISD::TexUnified2DS32S32; 2976 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 2977 return NVPTXISD::TexUnified2DS32Float; 2978 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 2979 return NVPTXISD::TexUnified2DS32FloatLevel; 2980 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 2981 return NVPTXISD::TexUnified2DS32FloatGrad; 2982 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 2983 return NVPTXISD::TexUnified2DU32S32; 2984 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 2985 return NVPTXISD::TexUnified2DU32Float; 2986 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 2987 return NVPTXISD::TexUnified2DU32FloatLevel; 2988 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 2989 return NVPTXISD::TexUnified2DU32FloatGrad; 2990 2991 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 2992 return NVPTXISD::TexUnified2DArrayFloatS32; 2993 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 2994 return NVPTXISD::TexUnified2DArrayFloatFloat; 2995 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 2996 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 2997 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 2998 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 2999 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3000 return NVPTXISD::TexUnified2DArrayS32S32; 3001 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3002 return NVPTXISD::TexUnified2DArrayS32Float; 3003 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3004 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3005 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3006 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3007 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3008 return NVPTXISD::TexUnified2DArrayU32S32; 3009 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3010 return NVPTXISD::TexUnified2DArrayU32Float; 3011 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3012 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3013 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3014 return NVPTXISD::TexUnified2DArrayU32FloatGrad; 3015 3016 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3017 return NVPTXISD::TexUnified3DFloatS32; 3018 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3019 return NVPTXISD::TexUnified3DFloatFloat; 3020 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3021 return NVPTXISD::TexUnified3DFloatFloatLevel; 3022 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3023 return NVPTXISD::TexUnified3DFloatFloatGrad; 3024 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3025 return NVPTXISD::TexUnified3DS32S32; 3026 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3027 return NVPTXISD::TexUnified3DS32Float; 
3028 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3029 return NVPTXISD::TexUnified3DS32FloatLevel; 3030 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3031 return NVPTXISD::TexUnified3DS32FloatGrad; 3032 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3033 return NVPTXISD::TexUnified3DU32S32; 3034 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3035 return NVPTXISD::TexUnified3DU32Float; 3036 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3037 return NVPTXISD::TexUnified3DU32FloatLevel; 3038 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3039 return NVPTXISD::TexUnified3DU32FloatGrad; 3040 3041 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3042 return NVPTXISD::TexUnifiedCubeFloatFloat; 3043 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3044 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3045 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3046 return NVPTXISD::TexUnifiedCubeS32Float; 3047 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3048 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3049 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3050 return NVPTXISD::TexUnifiedCubeU32Float; 3051 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3052 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3053 3054 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3055 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3056 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3057 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3058 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3059 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3060 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3061 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3062 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3063 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3064 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3065 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3066 3067 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3068 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3069 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3070 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3071 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3072 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3073 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3074 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3075 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3076 return NVPTXISD::Tld4UnifiedR2DS64Float; 3077 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3078 return NVPTXISD::Tld4UnifiedG2DS64Float; 3079 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3080 return NVPTXISD::Tld4UnifiedB2DS64Float; 3081 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3082 return NVPTXISD::Tld4UnifiedA2DS64Float; 3083 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3084 return NVPTXISD::Tld4UnifiedR2DU64Float; 3085 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3086 return NVPTXISD::Tld4UnifiedG2DU64Float; 3087 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3088 return NVPTXISD::Tld4UnifiedB2DU64Float; 3089 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3090 return NVPTXISD::Tld4UnifiedA2DU64Float; 3091 } 3092 } 3093 3094 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3095 switch (Intrinsic) { 3096 default: 3097 return 0; 3098 case Intrinsic::nvvm_suld_1d_i8_clamp: 3099 return NVPTXISD::Suld1DI8Clamp; 3100 case Intrinsic::nvvm_suld_1d_i16_clamp: 3101 return NVPTXISD::Suld1DI16Clamp; 3102 case Intrinsic::nvvm_suld_1d_i32_clamp: 3103 return 
NVPTXISD::Suld1DI32Clamp; 3104 case Intrinsic::nvvm_suld_1d_i64_clamp: 3105 return NVPTXISD::Suld1DI64Clamp; 3106 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3107 return NVPTXISD::Suld1DV2I8Clamp; 3108 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3109 return NVPTXISD::Suld1DV2I16Clamp; 3110 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3111 return NVPTXISD::Suld1DV2I32Clamp; 3112 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3113 return NVPTXISD::Suld1DV2I64Clamp; 3114 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3115 return NVPTXISD::Suld1DV4I8Clamp; 3116 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3117 return NVPTXISD::Suld1DV4I16Clamp; 3118 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3119 return NVPTXISD::Suld1DV4I32Clamp; 3120 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3121 return NVPTXISD::Suld1DArrayI8Clamp; 3122 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3123 return NVPTXISD::Suld1DArrayI16Clamp; 3124 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3125 return NVPTXISD::Suld1DArrayI32Clamp; 3126 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3127 return NVPTXISD::Suld1DArrayI64Clamp; 3128 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3129 return NVPTXISD::Suld1DArrayV2I8Clamp; 3130 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3131 return NVPTXISD::Suld1DArrayV2I16Clamp; 3132 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3133 return NVPTXISD::Suld1DArrayV2I32Clamp; 3134 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3135 return NVPTXISD::Suld1DArrayV2I64Clamp; 3136 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3137 return NVPTXISD::Suld1DArrayV4I8Clamp; 3138 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3139 return NVPTXISD::Suld1DArrayV4I16Clamp; 3140 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3141 return NVPTXISD::Suld1DArrayV4I32Clamp; 3142 case Intrinsic::nvvm_suld_2d_i8_clamp: 3143 return NVPTXISD::Suld2DI8Clamp; 3144 case Intrinsic::nvvm_suld_2d_i16_clamp: 3145 return NVPTXISD::Suld2DI16Clamp; 3146 case Intrinsic::nvvm_suld_2d_i32_clamp: 3147 return NVPTXISD::Suld2DI32Clamp; 3148 case Intrinsic::nvvm_suld_2d_i64_clamp: 3149 return NVPTXISD::Suld2DI64Clamp; 3150 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3151 return NVPTXISD::Suld2DV2I8Clamp; 3152 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3153 return NVPTXISD::Suld2DV2I16Clamp; 3154 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3155 return NVPTXISD::Suld2DV2I32Clamp; 3156 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3157 return NVPTXISD::Suld2DV2I64Clamp; 3158 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3159 return NVPTXISD::Suld2DV4I8Clamp; 3160 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3161 return NVPTXISD::Suld2DV4I16Clamp; 3162 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3163 return NVPTXISD::Suld2DV4I32Clamp; 3164 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3165 return NVPTXISD::Suld2DArrayI8Clamp; 3166 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3167 return NVPTXISD::Suld2DArrayI16Clamp; 3168 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 3169 return NVPTXISD::Suld2DArrayI32Clamp; 3170 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3171 return NVPTXISD::Suld2DArrayI64Clamp; 3172 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3173 return NVPTXISD::Suld2DArrayV2I8Clamp; 3174 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3175 return NVPTXISD::Suld2DArrayV2I16Clamp; 3176 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3177 return NVPTXISD::Suld2DArrayV2I32Clamp; 3178 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3179 return NVPTXISD::Suld2DArrayV2I64Clamp; 3180 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3181 return 
NVPTXISD::Suld2DArrayV4I8Clamp; 3182 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3183 return NVPTXISD::Suld2DArrayV4I16Clamp; 3184 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3185 return NVPTXISD::Suld2DArrayV4I32Clamp; 3186 case Intrinsic::nvvm_suld_3d_i8_clamp: 3187 return NVPTXISD::Suld3DI8Clamp; 3188 case Intrinsic::nvvm_suld_3d_i16_clamp: 3189 return NVPTXISD::Suld3DI16Clamp; 3190 case Intrinsic::nvvm_suld_3d_i32_clamp: 3191 return NVPTXISD::Suld3DI32Clamp; 3192 case Intrinsic::nvvm_suld_3d_i64_clamp: 3193 return NVPTXISD::Suld3DI64Clamp; 3194 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3195 return NVPTXISD::Suld3DV2I8Clamp; 3196 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3197 return NVPTXISD::Suld3DV2I16Clamp; 3198 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3199 return NVPTXISD::Suld3DV2I32Clamp; 3200 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3201 return NVPTXISD::Suld3DV2I64Clamp; 3202 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 3203 return NVPTXISD::Suld3DV4I8Clamp; 3204 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 3205 return NVPTXISD::Suld3DV4I16Clamp; 3206 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 3207 return NVPTXISD::Suld3DV4I32Clamp; 3208 case Intrinsic::nvvm_suld_1d_i8_trap: 3209 return NVPTXISD::Suld1DI8Trap; 3210 case Intrinsic::nvvm_suld_1d_i16_trap: 3211 return NVPTXISD::Suld1DI16Trap; 3212 case Intrinsic::nvvm_suld_1d_i32_trap: 3213 return NVPTXISD::Suld1DI32Trap; 3214 case Intrinsic::nvvm_suld_1d_i64_trap: 3215 return NVPTXISD::Suld1DI64Trap; 3216 case Intrinsic::nvvm_suld_1d_v2i8_trap: 3217 return NVPTXISD::Suld1DV2I8Trap; 3218 case Intrinsic::nvvm_suld_1d_v2i16_trap: 3219 return NVPTXISD::Suld1DV2I16Trap; 3220 case Intrinsic::nvvm_suld_1d_v2i32_trap: 3221 return NVPTXISD::Suld1DV2I32Trap; 3222 case Intrinsic::nvvm_suld_1d_v2i64_trap: 3223 return NVPTXISD::Suld1DV2I64Trap; 3224 case Intrinsic::nvvm_suld_1d_v4i8_trap: 3225 return NVPTXISD::Suld1DV4I8Trap; 3226 case Intrinsic::nvvm_suld_1d_v4i16_trap: 3227 return NVPTXISD::Suld1DV4I16Trap; 3228 case Intrinsic::nvvm_suld_1d_v4i32_trap: 3229 return NVPTXISD::Suld1DV4I32Trap; 3230 case Intrinsic::nvvm_suld_1d_array_i8_trap: 3231 return NVPTXISD::Suld1DArrayI8Trap; 3232 case Intrinsic::nvvm_suld_1d_array_i16_trap: 3233 return NVPTXISD::Suld1DArrayI16Trap; 3234 case Intrinsic::nvvm_suld_1d_array_i32_trap: 3235 return NVPTXISD::Suld1DArrayI32Trap; 3236 case Intrinsic::nvvm_suld_1d_array_i64_trap: 3237 return NVPTXISD::Suld1DArrayI64Trap; 3238 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 3239 return NVPTXISD::Suld1DArrayV2I8Trap; 3240 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 3241 return NVPTXISD::Suld1DArrayV2I16Trap; 3242 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 3243 return NVPTXISD::Suld1DArrayV2I32Trap; 3244 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 3245 return NVPTXISD::Suld1DArrayV2I64Trap; 3246 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 3247 return NVPTXISD::Suld1DArrayV4I8Trap; 3248 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 3249 return NVPTXISD::Suld1DArrayV4I16Trap; 3250 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 3251 return NVPTXISD::Suld1DArrayV4I32Trap; 3252 case Intrinsic::nvvm_suld_2d_i8_trap: 3253 return NVPTXISD::Suld2DI8Trap; 3254 case Intrinsic::nvvm_suld_2d_i16_trap: 3255 return NVPTXISD::Suld2DI16Trap; 3256 case Intrinsic::nvvm_suld_2d_i32_trap: 3257 return NVPTXISD::Suld2DI32Trap; 3258 case Intrinsic::nvvm_suld_2d_i64_trap: 3259 return NVPTXISD::Suld2DI64Trap; 3260 case Intrinsic::nvvm_suld_2d_v2i8_trap: 3261 return NVPTXISD::Suld2DV2I8Trap; 3262 case 
Intrinsic::nvvm_suld_2d_v2i16_trap: 3263 return NVPTXISD::Suld2DV2I16Trap; 3264 case Intrinsic::nvvm_suld_2d_v2i32_trap: 3265 return NVPTXISD::Suld2DV2I32Trap; 3266 case Intrinsic::nvvm_suld_2d_v2i64_trap: 3267 return NVPTXISD::Suld2DV2I64Trap; 3268 case Intrinsic::nvvm_suld_2d_v4i8_trap: 3269 return NVPTXISD::Suld2DV4I8Trap; 3270 case Intrinsic::nvvm_suld_2d_v4i16_trap: 3271 return NVPTXISD::Suld2DV4I16Trap; 3272 case Intrinsic::nvvm_suld_2d_v4i32_trap: 3273 return NVPTXISD::Suld2DV4I32Trap; 3274 case Intrinsic::nvvm_suld_2d_array_i8_trap: 3275 return NVPTXISD::Suld2DArrayI8Trap; 3276 case Intrinsic::nvvm_suld_2d_array_i16_trap: 3277 return NVPTXISD::Suld2DArrayI16Trap; 3278 case Intrinsic::nvvm_suld_2d_array_i32_trap: 3279 return NVPTXISD::Suld2DArrayI32Trap; 3280 case Intrinsic::nvvm_suld_2d_array_i64_trap: 3281 return NVPTXISD::Suld2DArrayI64Trap; 3282 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 3283 return NVPTXISD::Suld2DArrayV2I8Trap; 3284 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 3285 return NVPTXISD::Suld2DArrayV2I16Trap; 3286 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 3287 return NVPTXISD::Suld2DArrayV2I32Trap; 3288 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 3289 return NVPTXISD::Suld2DArrayV2I64Trap; 3290 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 3291 return NVPTXISD::Suld2DArrayV4I8Trap; 3292 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 3293 return NVPTXISD::Suld2DArrayV4I16Trap; 3294 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 3295 return NVPTXISD::Suld2DArrayV4I32Trap; 3296 case Intrinsic::nvvm_suld_3d_i8_trap: 3297 return NVPTXISD::Suld3DI8Trap; 3298 case Intrinsic::nvvm_suld_3d_i16_trap: 3299 return NVPTXISD::Suld3DI16Trap; 3300 case Intrinsic::nvvm_suld_3d_i32_trap: 3301 return NVPTXISD::Suld3DI32Trap; 3302 case Intrinsic::nvvm_suld_3d_i64_trap: 3303 return NVPTXISD::Suld3DI64Trap; 3304 case Intrinsic::nvvm_suld_3d_v2i8_trap: 3305 return NVPTXISD::Suld3DV2I8Trap; 3306 case Intrinsic::nvvm_suld_3d_v2i16_trap: 3307 return NVPTXISD::Suld3DV2I16Trap; 3308 case Intrinsic::nvvm_suld_3d_v2i32_trap: 3309 return NVPTXISD::Suld3DV2I32Trap; 3310 case Intrinsic::nvvm_suld_3d_v2i64_trap: 3311 return NVPTXISD::Suld3DV2I64Trap; 3312 case Intrinsic::nvvm_suld_3d_v4i8_trap: 3313 return NVPTXISD::Suld3DV4I8Trap; 3314 case Intrinsic::nvvm_suld_3d_v4i16_trap: 3315 return NVPTXISD::Suld3DV4I16Trap; 3316 case Intrinsic::nvvm_suld_3d_v4i32_trap: 3317 return NVPTXISD::Suld3DV4I32Trap; 3318 case Intrinsic::nvvm_suld_1d_i8_zero: 3319 return NVPTXISD::Suld1DI8Zero; 3320 case Intrinsic::nvvm_suld_1d_i16_zero: 3321 return NVPTXISD::Suld1DI16Zero; 3322 case Intrinsic::nvvm_suld_1d_i32_zero: 3323 return NVPTXISD::Suld1DI32Zero; 3324 case Intrinsic::nvvm_suld_1d_i64_zero: 3325 return NVPTXISD::Suld1DI64Zero; 3326 case Intrinsic::nvvm_suld_1d_v2i8_zero: 3327 return NVPTXISD::Suld1DV2I8Zero; 3328 case Intrinsic::nvvm_suld_1d_v2i16_zero: 3329 return NVPTXISD::Suld1DV2I16Zero; 3330 case Intrinsic::nvvm_suld_1d_v2i32_zero: 3331 return NVPTXISD::Suld1DV2I32Zero; 3332 case Intrinsic::nvvm_suld_1d_v2i64_zero: 3333 return NVPTXISD::Suld1DV2I64Zero; 3334 case Intrinsic::nvvm_suld_1d_v4i8_zero: 3335 return NVPTXISD::Suld1DV4I8Zero; 3336 case Intrinsic::nvvm_suld_1d_v4i16_zero: 3337 return NVPTXISD::Suld1DV4I16Zero; 3338 case Intrinsic::nvvm_suld_1d_v4i32_zero: 3339 return NVPTXISD::Suld1DV4I32Zero; 3340 case Intrinsic::nvvm_suld_1d_array_i8_zero: 3341 return NVPTXISD::Suld1DArrayI8Zero; 3342 case Intrinsic::nvvm_suld_1d_array_i16_zero: 3343 return NVPTXISD::Suld1DArrayI16Zero; 3344 case 
Intrinsic::nvvm_suld_1d_array_i32_zero: 3345 return NVPTXISD::Suld1DArrayI32Zero; 3346 case Intrinsic::nvvm_suld_1d_array_i64_zero: 3347 return NVPTXISD::Suld1DArrayI64Zero; 3348 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 3349 return NVPTXISD::Suld1DArrayV2I8Zero; 3350 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 3351 return NVPTXISD::Suld1DArrayV2I16Zero; 3352 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 3353 return NVPTXISD::Suld1DArrayV2I32Zero; 3354 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 3355 return NVPTXISD::Suld1DArrayV2I64Zero; 3356 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 3357 return NVPTXISD::Suld1DArrayV4I8Zero; 3358 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 3359 return NVPTXISD::Suld1DArrayV4I16Zero; 3360 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 3361 return NVPTXISD::Suld1DArrayV4I32Zero; 3362 case Intrinsic::nvvm_suld_2d_i8_zero: 3363 return NVPTXISD::Suld2DI8Zero; 3364 case Intrinsic::nvvm_suld_2d_i16_zero: 3365 return NVPTXISD::Suld2DI16Zero; 3366 case Intrinsic::nvvm_suld_2d_i32_zero: 3367 return NVPTXISD::Suld2DI32Zero; 3368 case Intrinsic::nvvm_suld_2d_i64_zero: 3369 return NVPTXISD::Suld2DI64Zero; 3370 case Intrinsic::nvvm_suld_2d_v2i8_zero: 3371 return NVPTXISD::Suld2DV2I8Zero; 3372 case Intrinsic::nvvm_suld_2d_v2i16_zero: 3373 return NVPTXISD::Suld2DV2I16Zero; 3374 case Intrinsic::nvvm_suld_2d_v2i32_zero: 3375 return NVPTXISD::Suld2DV2I32Zero; 3376 case Intrinsic::nvvm_suld_2d_v2i64_zero: 3377 return NVPTXISD::Suld2DV2I64Zero; 3378 case Intrinsic::nvvm_suld_2d_v4i8_zero: 3379 return NVPTXISD::Suld2DV4I8Zero; 3380 case Intrinsic::nvvm_suld_2d_v4i16_zero: 3381 return NVPTXISD::Suld2DV4I16Zero; 3382 case Intrinsic::nvvm_suld_2d_v4i32_zero: 3383 return NVPTXISD::Suld2DV4I32Zero; 3384 case Intrinsic::nvvm_suld_2d_array_i8_zero: 3385 return NVPTXISD::Suld2DArrayI8Zero; 3386 case Intrinsic::nvvm_suld_2d_array_i16_zero: 3387 return NVPTXISD::Suld2DArrayI16Zero; 3388 case Intrinsic::nvvm_suld_2d_array_i32_zero: 3389 return NVPTXISD::Suld2DArrayI32Zero; 3390 case Intrinsic::nvvm_suld_2d_array_i64_zero: 3391 return NVPTXISD::Suld2DArrayI64Zero; 3392 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 3393 return NVPTXISD::Suld2DArrayV2I8Zero; 3394 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 3395 return NVPTXISD::Suld2DArrayV2I16Zero; 3396 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 3397 return NVPTXISD::Suld2DArrayV2I32Zero; 3398 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 3399 return NVPTXISD::Suld2DArrayV2I64Zero; 3400 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 3401 return NVPTXISD::Suld2DArrayV4I8Zero; 3402 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 3403 return NVPTXISD::Suld2DArrayV4I16Zero; 3404 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 3405 return NVPTXISD::Suld2DArrayV4I32Zero; 3406 case Intrinsic::nvvm_suld_3d_i8_zero: 3407 return NVPTXISD::Suld3DI8Zero; 3408 case Intrinsic::nvvm_suld_3d_i16_zero: 3409 return NVPTXISD::Suld3DI16Zero; 3410 case Intrinsic::nvvm_suld_3d_i32_zero: 3411 return NVPTXISD::Suld3DI32Zero; 3412 case Intrinsic::nvvm_suld_3d_i64_zero: 3413 return NVPTXISD::Suld3DI64Zero; 3414 case Intrinsic::nvvm_suld_3d_v2i8_zero: 3415 return NVPTXISD::Suld3DV2I8Zero; 3416 case Intrinsic::nvvm_suld_3d_v2i16_zero: 3417 return NVPTXISD::Suld3DV2I16Zero; 3418 case Intrinsic::nvvm_suld_3d_v2i32_zero: 3419 return NVPTXISD::Suld3DV2I32Zero; 3420 case Intrinsic::nvvm_suld_3d_v2i64_zero: 3421 return NVPTXISD::Suld3DV2I64Zero; 3422 case Intrinsic::nvvm_suld_3d_v4i8_zero: 3423 return NVPTXISD::Suld3DV4I8Zero; 3424 case 
Intrinsic::nvvm_suld_3d_v4i16_zero:
    return NVPTXISD::Suld3DV4I16Zero;
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
    return NVPTXISD::Suld3DV4I32Zero;
  }
}

// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need the information that is only available in
// the "Value" type of the destination pointer, in particular its address
// space.
bool NVPTXTargetLowering::getTgtMemIntrinsic(
    IntrinsicInfo &Info, const CallInst &I,
    MachineFunction &MF, unsigned Intrinsic) const {
  switch (Intrinsic) {
  default:
    return false;
  case Intrinsic::nvvm_match_all_sync_i32p:
  case Intrinsic::nvvm_match_all_sync_i64p:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
    // in order to model data exchange with other threads, but perform no real
    // memory accesses.
    Info.memVT = MVT::i1;

    // Our result depends on both our and other thread's arguments.
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3497 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 3498 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 3499 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 3500 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: { 3501 Info.opc = ISD::INTRINSIC_W_CHAIN; 3502 Info.memVT = MVT::v2i32; 3503 Info.ptrVal = I.getArgOperand(0); 3504 Info.offset = 0; 3505 Info.flags = MachineMemOperand::MOLoad; 3506 Info.align = Align(8); 3507 return true; 3508 } 3509 3510 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 3511 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 3512 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 3513 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 3514 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 3515 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 3516 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 3517 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 3518 3519 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 3520 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 3521 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 3522 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 3523 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 3524 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 3525 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 3526 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: { 3527 Info.opc = ISD::INTRINSIC_W_CHAIN; 3528 Info.memVT = MVT::v4i32; 3529 Info.ptrVal = I.getArgOperand(0); 3530 Info.offset = 0; 3531 Info.flags = MachineMemOperand::MOLoad; 3532 Info.align = Align(16); 3533 return true; 3534 } 3535 3536 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 3537 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 3538 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 3539 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 3540 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 3541 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 3542 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 3543 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 3544 3545 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 3546 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 3547 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 3548 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 3549 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 3550 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 3551 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 3552 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 3553 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 3554 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 3555 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 3556 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 3557 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 3558 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 3559 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 3560 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 3561 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 3562 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 3563 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 3564 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: { 3565 Info.opc = ISD::INTRINSIC_W_CHAIN; 3566 Info.memVT = MVT::i32; 3567 Info.ptrVal = I.getArgOperand(0); 3568 Info.offset = 0; 3569 Info.flags = MachineMemOperand::MOLoad; 3570 Info.align = Align(4); 3571 return true; 3572 } 3573 3574 case 
Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 3575 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 3576 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 3577 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 3578 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 3579 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 3580 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 3581 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 3582 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 3583 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 3584 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 3585 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 3586 Info.opc = ISD::INTRINSIC_W_CHAIN; 3587 Info.memVT = MVT::v4f16; 3588 Info.ptrVal = I.getArgOperand(0); 3589 Info.offset = 0; 3590 Info.flags = MachineMemOperand::MOLoad; 3591 Info.align = Align(16); 3592 return true; 3593 } 3594 3595 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 3596 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 3597 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 3598 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 3599 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 3600 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 3601 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 3602 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 3603 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 3604 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 3605 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 3606 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: { 3607 Info.opc = ISD::INTRINSIC_W_CHAIN; 3608 Info.memVT = MVT::v8f32; 3609 Info.ptrVal = I.getArgOperand(0); 3610 Info.offset = 0; 3611 Info.flags = MachineMemOperand::MOLoad; 3612 Info.align = Align(16); 3613 return true; 3614 } 3615 3616 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 3617 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 3618 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 3619 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 3620 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 3621 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 3622 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 3623 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 3624 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 3625 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 3626 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 3627 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 3628 Info.opc = ISD::INTRINSIC_W_CHAIN; 3629 Info.memVT = MVT::v8i32; 3630 Info.ptrVal = I.getArgOperand(0); 3631 Info.offset = 0; 3632 Info.flags = MachineMemOperand::MOLoad; 3633 Info.align = Align(16); 3634 return true; 3635 } 3636 3637 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 3638 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 3639 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 3640 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 3641 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 3642 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 3643 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 3644 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: { 3645 Info.opc = ISD::INTRINSIC_W_CHAIN; 3646 Info.memVT = MVT::v2i32; 3647 Info.ptrVal = I.getArgOperand(0); 3648 Info.offset = 0; 3649 Info.flags = MachineMemOperand::MOLoad; 3650 Info.align = Align(8); 3651 return true; 
3652 } 3653 3654 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 3655 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 3656 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 3657 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 3658 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 3659 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 3660 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 3661 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 3662 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 3663 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 3664 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 3665 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 3666 Info.opc = ISD::INTRINSIC_VOID; 3667 Info.memVT = MVT::v4f16; 3668 Info.ptrVal = I.getArgOperand(0); 3669 Info.offset = 0; 3670 Info.flags = MachineMemOperand::MOStore; 3671 Info.align = Align(16); 3672 return true; 3673 } 3674 3675 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 3676 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 3677 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 3678 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 3679 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 3680 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 3681 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 3682 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 3683 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 3684 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 3685 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 3686 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: { 3687 Info.opc = ISD::INTRINSIC_VOID; 3688 Info.memVT = MVT::v8f32; 3689 Info.ptrVal = I.getArgOperand(0); 3690 Info.offset = 0; 3691 Info.flags = MachineMemOperand::MOStore; 3692 Info.align = Align(16); 3693 return true; 3694 } 3695 3696 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 3697 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 3698 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 3699 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 3700 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 3701 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 3702 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 3703 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 3704 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 3705 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 3706 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 3707 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 3708 Info.opc = ISD::INTRINSIC_VOID; 3709 Info.memVT = MVT::v8i32; 3710 Info.ptrVal = I.getArgOperand(0); 3711 Info.offset = 0; 3712 Info.flags = MachineMemOperand::MOStore; 3713 Info.align = Align(16); 3714 return true; 3715 } 3716 3717 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 3718 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 3719 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 3720 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 3721 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 3722 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 3723 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 3724 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 3725 Info.opc = ISD::INTRINSIC_VOID; 3726 Info.memVT = MVT::v2i32; 3727 Info.ptrVal = I.getArgOperand(0); 3728 Info.offset = 0; 3729 Info.flags = 
MachineMemOperand::MOStore; 3730 Info.align = Align(8); 3731 return true; 3732 } 3733 3734 case Intrinsic::nvvm_atomic_load_inc_32: 3735 case Intrinsic::nvvm_atomic_load_dec_32: 3736 3737 case Intrinsic::nvvm_atomic_add_gen_f_cta: 3738 case Intrinsic::nvvm_atomic_add_gen_f_sys: 3739 case Intrinsic::nvvm_atomic_add_gen_i_cta: 3740 case Intrinsic::nvvm_atomic_add_gen_i_sys: 3741 case Intrinsic::nvvm_atomic_and_gen_i_cta: 3742 case Intrinsic::nvvm_atomic_and_gen_i_sys: 3743 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 3744 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 3745 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 3746 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 3747 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 3748 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 3749 case Intrinsic::nvvm_atomic_max_gen_i_cta: 3750 case Intrinsic::nvvm_atomic_max_gen_i_sys: 3751 case Intrinsic::nvvm_atomic_min_gen_i_cta: 3752 case Intrinsic::nvvm_atomic_min_gen_i_sys: 3753 case Intrinsic::nvvm_atomic_or_gen_i_cta: 3754 case Intrinsic::nvvm_atomic_or_gen_i_sys: 3755 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 3756 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 3757 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 3758 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 3759 auto &DL = I.getModule()->getDataLayout(); 3760 Info.opc = ISD::INTRINSIC_W_CHAIN; 3761 Info.memVT = getValueType(DL, I.getType()); 3762 Info.ptrVal = I.getArgOperand(0); 3763 Info.offset = 0; 3764 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 3765 Info.align.reset(); 3766 return true; 3767 } 3768 3769 case Intrinsic::nvvm_ldu_global_i: 3770 case Intrinsic::nvvm_ldu_global_f: 3771 case Intrinsic::nvvm_ldu_global_p: { 3772 auto &DL = I.getModule()->getDataLayout(); 3773 Info.opc = ISD::INTRINSIC_W_CHAIN; 3774 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 3775 Info.memVT = getValueType(DL, I.getType()); 3776 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 3777 Info.memVT = getPointerTy(DL); 3778 else 3779 Info.memVT = getValueType(DL, I.getType()); 3780 Info.ptrVal = I.getArgOperand(0); 3781 Info.offset = 0; 3782 Info.flags = MachineMemOperand::MOLoad; 3783 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 3784 3785 return true; 3786 } 3787 case Intrinsic::nvvm_ldg_global_i: 3788 case Intrinsic::nvvm_ldg_global_f: 3789 case Intrinsic::nvvm_ldg_global_p: { 3790 auto &DL = I.getModule()->getDataLayout(); 3791 3792 Info.opc = ISD::INTRINSIC_W_CHAIN; 3793 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 3794 Info.memVT = getValueType(DL, I.getType()); 3795 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 3796 Info.memVT = getPointerTy(DL); 3797 else 3798 Info.memVT = getValueType(DL, I.getType()); 3799 Info.ptrVal = I.getArgOperand(0); 3800 Info.offset = 0; 3801 Info.flags = MachineMemOperand::MOLoad; 3802 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 3803 3804 return true; 3805 } 3806 3807 case Intrinsic::nvvm_tex_1d_v4f32_s32: 3808 case Intrinsic::nvvm_tex_1d_v4f32_f32: 3809 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 3810 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3811 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3812 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3813 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3814 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3815 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3816 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3817 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3818 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3819 case 
Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3820 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3821 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3822 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3823 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3824 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3825 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3826 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3827 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3828 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3829 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3830 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3831 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3832 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3833 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3834 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3835 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3836 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3837 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3838 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3839 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3840 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3841 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3842 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3843 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3844 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3845 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3846 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3847 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3848 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3849 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3850 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3851 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3852 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3853 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3854 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3855 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3856 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3857 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3858 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3859 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3860 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3861 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3862 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3863 Info.opc = getOpcForTextureInstr(Intrinsic); 3864 Info.memVT = MVT::v4f32; 3865 Info.ptrVal = nullptr; 3866 Info.offset = 0; 3867 Info.flags = MachineMemOperand::MOLoad; 3868 Info.align = Align(16); 3869 return true; 3870 3871 case Intrinsic::nvvm_tex_1d_v4s32_s32: 3872 case Intrinsic::nvvm_tex_1d_v4s32_f32: 3873 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 3874 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 3875 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 3876 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 3877 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 3878 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 3879 case Intrinsic::nvvm_tex_2d_v4s32_s32: 3880 case Intrinsic::nvvm_tex_2d_v4s32_f32: 3881 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 3882 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 3883 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3884 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3885 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3886 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3887 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3888 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3889 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3890 
case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3891 case Intrinsic::nvvm_tex_cube_v4s32_f32: 3892 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3893 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3894 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 3895 case Intrinsic::nvvm_tex_cube_v4u32_f32: 3896 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 3897 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 3898 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 3899 case Intrinsic::nvvm_tex_1d_v4u32_s32: 3900 case Intrinsic::nvvm_tex_1d_v4u32_f32: 3901 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 3902 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 3903 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 3904 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 3905 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 3906 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 3907 case Intrinsic::nvvm_tex_2d_v4u32_s32: 3908 case Intrinsic::nvvm_tex_2d_v4u32_f32: 3909 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 3910 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 3911 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 3912 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 3913 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 3914 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 3915 case Intrinsic::nvvm_tex_3d_v4u32_s32: 3916 case Intrinsic::nvvm_tex_3d_v4u32_f32: 3917 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 3918 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 3919 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 3920 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 3921 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 3922 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 3923 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 3924 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 3925 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 3926 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 3927 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 3928 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 3929 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 3930 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 3931 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 3932 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3933 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3934 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3935 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3936 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3937 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3938 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3939 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3940 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3941 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3942 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3943 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3944 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3945 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3946 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3947 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 3948 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 3949 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 3950 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 3951 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3952 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3953 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3954 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3955 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3956 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3957 case 
Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3958 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3959 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3960 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3961 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3962 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3963 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3964 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3965 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3966 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3967 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3968 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3969 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3970 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3971 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3972 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3973 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3974 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3975 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3976 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3977 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3978 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3979 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3980 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3981 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3982 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3983 Info.opc = getOpcForTextureInstr(Intrinsic); 3984 Info.memVT = MVT::v4i32; 3985 Info.ptrVal = nullptr; 3986 Info.offset = 0; 3987 Info.flags = MachineMemOperand::MOLoad; 3988 Info.align = Align(16); 3989 return true; 3990 3991 case Intrinsic::nvvm_suld_1d_i8_clamp: 3992 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3993 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3994 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3995 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3996 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3997 case Intrinsic::nvvm_suld_2d_i8_clamp: 3998 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3999 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4000 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4001 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4002 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4003 case Intrinsic::nvvm_suld_3d_i8_clamp: 4004 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4005 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4006 case Intrinsic::nvvm_suld_1d_i8_trap: 4007 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4008 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4009 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4010 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4011 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4012 case Intrinsic::nvvm_suld_2d_i8_trap: 4013 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4014 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4015 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4016 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4017 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4018 case Intrinsic::nvvm_suld_3d_i8_trap: 4019 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4020 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4021 case Intrinsic::nvvm_suld_1d_i8_zero: 4022 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4023 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4024 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4025 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4026 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4027 case Intrinsic::nvvm_suld_2d_i8_zero: 4028 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4029 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4030 
case Intrinsic::nvvm_suld_2d_array_i8_zero: 4031 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4032 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4033 case Intrinsic::nvvm_suld_3d_i8_zero: 4034 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4035 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4036 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4037 Info.memVT = MVT::i8; 4038 Info.ptrVal = nullptr; 4039 Info.offset = 0; 4040 Info.flags = MachineMemOperand::MOLoad; 4041 Info.align = Align(16); 4042 return true; 4043 4044 case Intrinsic::nvvm_suld_1d_i16_clamp: 4045 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4046 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4047 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4048 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4049 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4050 case Intrinsic::nvvm_suld_2d_i16_clamp: 4051 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4052 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4053 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4054 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4055 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4056 case Intrinsic::nvvm_suld_3d_i16_clamp: 4057 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4058 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4059 case Intrinsic::nvvm_suld_1d_i16_trap: 4060 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4061 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4062 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4063 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4064 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4065 case Intrinsic::nvvm_suld_2d_i16_trap: 4066 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4067 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4068 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4069 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4070 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4071 case Intrinsic::nvvm_suld_3d_i16_trap: 4072 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4073 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4074 case Intrinsic::nvvm_suld_1d_i16_zero: 4075 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4076 case Intrinsic::nvvm_suld_1d_v4i16_zero: 4077 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4078 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4079 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4080 case Intrinsic::nvvm_suld_2d_i16_zero: 4081 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4082 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4083 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4084 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4085 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4086 case Intrinsic::nvvm_suld_3d_i16_zero: 4087 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4088 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4089 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4090 Info.memVT = MVT::i16; 4091 Info.ptrVal = nullptr; 4092 Info.offset = 0; 4093 Info.flags = MachineMemOperand::MOLoad; 4094 Info.align = Align(16); 4095 return true; 4096 4097 case Intrinsic::nvvm_suld_1d_i32_clamp: 4098 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4099 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4100 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 4101 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 4102 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 4103 case Intrinsic::nvvm_suld_2d_i32_clamp: 4104 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 4105 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 4106 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 4107 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 4108 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 4109 case Intrinsic::nvvm_suld_3d_i32_clamp: 4110 case 
  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_i32_trap:
  case Intrinsic::nvvm_suld_1d_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_v4i32_trap:
  case Intrinsic::nvvm_suld_1d_array_i32_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
  case Intrinsic::nvvm_suld_2d_i32_trap:
  case Intrinsic::nvvm_suld_2d_v2i32_trap:
  case Intrinsic::nvvm_suld_2d_v4i32_trap:
  case Intrinsic::nvvm_suld_2d_array_i32_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
  case Intrinsic::nvvm_suld_3d_i32_trap:
  case Intrinsic::nvvm_suld_3d_v2i32_trap:
  case Intrinsic::nvvm_suld_3d_v4i32_trap:
  case Intrinsic::nvvm_suld_1d_i32_zero:
  case Intrinsic::nvvm_suld_1d_v2i32_zero:
  case Intrinsic::nvvm_suld_1d_v4i32_zero:
  case Intrinsic::nvvm_suld_1d_array_i32_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
  case Intrinsic::nvvm_suld_2d_i32_zero:
  case Intrinsic::nvvm_suld_2d_v2i32_zero:
  case Intrinsic::nvvm_suld_2d_v4i32_zero:
  case Intrinsic::nvvm_suld_2d_array_i32_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
  case Intrinsic::nvvm_suld_3d_i32_zero:
  case Intrinsic::nvvm_suld_3d_v2i32_zero:
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i64_clamp:
  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_i64_clamp:
  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_3d_i64_clamp:
  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_i64_trap:
  case Intrinsic::nvvm_suld_1d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_array_i64_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
  case Intrinsic::nvvm_suld_2d_i64_trap:
  case Intrinsic::nvvm_suld_2d_v2i64_trap:
  case Intrinsic::nvvm_suld_2d_array_i64_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
  case Intrinsic::nvvm_suld_3d_i64_trap:
  case Intrinsic::nvvm_suld_3d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_i64_zero:
  case Intrinsic::nvvm_suld_1d_v2i64_zero:
  case Intrinsic::nvvm_suld_1d_array_i64_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
  case Intrinsic::nvvm_suld_2d_i64_zero:
  case Intrinsic::nvvm_suld_2d_v2i64_zero:
  case Intrinsic::nvvm_suld_2d_array_i64_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
  case Intrinsic::nvvm_suld_3d_i64_zero:
  case Intrinsic::nvvm_suld_3d_v2i64_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i64;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  return false;
}
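
// Illustrative sketch (hedged; the PTX syntax shown here is an assumption,
// not quoted from the spec): the address modes accepted by the routine below
// correspond to PTX operand forms such as
//   ld.global.f32 %f0, [gvar];       // [avar]
//   ld.global.f32 %f0, [%rd1];       // [areg]
//   ld.global.f32 %f0, [%rd1+16];    // [areg+immoff]
//   ld.global.f32 %f0, [4096];       // [immAddr]
// Reg+reg and scaled-index forms have no direct PTX equivalent, which is why
// they are rejected.
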
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp)
bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS,
                                                Instruction *I) const {
  // AddrMode - This represents an addressing mode of:
  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  //
  // The legal address modes are
  // - [avar]
  // - [areg]
  // - [areg+immoff]
  // - [immAddr]

  if (AM.BaseGV) {
    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
  }

  switch (AM.Scale) {
  case 0: // "r", "r+i" or "i" is allowed
    break;
  case 1:
    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
      return false;
    // Otherwise we have r+i.
    break;
  default:
    // No scale > 1 is allowed
    return false;
  }
  return true;
}

//===----------------------------------------------------------------------===//
// NVPTX Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
NVPTXTargetLowering::ConstraintType
NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'b':
    case 'r':
    case 'h':
    case 'c':
    case 'l':
    case 'f':
    case 'd':
    case '0':
    case 'N':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

std::pair<unsigned, const TargetRegisterClass *>
NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                  StringRef Constraint,
                                                  MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'b':
      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
    case 'c':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'h':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'r':
      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
    case 'l':
    case 'N':
      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
    case 'f':
      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
    case 'd':
      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
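
// Hedged usage sketch (user-level syntax, not part of this file's API): the
// constraint letters handled above are what inline PTX on NVPTX typically
// uses, e.g.
//   asm("add.s32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b));
// where 'r' selects a 32-bit integer register, 'l' a 64-bit one, 'h' a
// 16-bit one, 'f' an f32 register, 'd' an f64 register and 'b' a predicate,
// matching the register classes returned above.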

//===----------------------------------------------------------------------===//
// NVPTX DAG Combining
//===----------------------------------------------------------------------===//

bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
                                   CodeGenOpt::Level OptLevel) const {
  // Always honor command-line argument
  if (FMAContractLevelOpt.getNumOccurrences() > 0)
    return FMAContractLevelOpt > 0;

  // Do not contract if we're not optimizing the code.
  if (OptLevel == 0)
    return false;

  // Honor TargetOptions flags that explicitly say fusion is okay.
  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
    return true;

  return allowUnsafeFPMath(MF);
}

bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  if (MF.getTarget().Options.UnsafeFPMath)
    return true;

  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
  const Function &F = MF.getFunction();
  if (F.hasFnAttribute("unsafe-fp-math")) {
    Attribute Attr = F.getFnAttribute("unsafe-fp-math");
    StringRef Val = Attr.getValueAsString();
    if (Val == "true")
      return true;
  }

  return false;
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const NVPTXSubtarget &Subtarget,
                                             CodeGenOpt::Level OptLevel) {
  SelectionDAG &DAG = DCI.DAG;
  // Skip vector types; this combine only handles scalar values.
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
    // For integer:
    // Since integer multiply-add costs the same as integer multiply
    // but is more costly than integer add, do the fusion only when
    // the mul is used only by the add.
    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                       N0.getOperand(0), N0.getOperand(1), N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
          &DAG.getTargetLoweringInfo());
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has fewer than 5 uses and all of
      // them are adds.
      // The heuristic is that if a use is not an add, then that use cannot
      // be fused into an fma, so the mul is still needed anyway.
      // If there are more than 4 uses, even if they are all adds, fusing
      // them will increase register pressure.
      //
      int numUses = 0;
      int nonAddCount = 0;
      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
                                UE = N0.getNode()->use_end();
           UI != UE; ++UI) {
        numUses++;
        SDNode *User = *UI;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for estimating potential register pressure: the
        // difference in IR order approximates the distance between the def
        // and this use, and a longer distance is more likely to cause
        // register pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // node N, which guarantees that the FMA will not increase register
        // pressure at node N.
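        // (Informal restatement of the checks below: if either multiplicand
        // is a constant, or is also consumed by some node with a larger IR
        // order than N, it stays live past N regardless, so forming an FMA
        // here does not extend any live range.)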
        bool opIsLive = false;
        const SDNode *left = N0.getOperand(0).getNode();
        const SDNode *right = N0.getOperand(1).getNode();

        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
          opIsLive = true;

        if (!opIsLive)
          for (SDNode::use_iterator UI = left->use_begin(),
                                    UE = left->use_end();
               UI != UE; ++UI) {
            SDNode *User = *UI;
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          for (SDNode::use_iterator UI = right->use_begin(),
                                    UE = right->use_end();
               UI != UE; ++UI) {
            SDNode *User = *UI;
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          return SDValue();
      }

      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                         N0.getOperand(0), N0.getOperand(1), N1);
    }
  }

  return SDValue();
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const NVPTXSubtarget &Subtarget,
                                 CodeGenOpt::Level OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // First try with the default operand order.
  if (SDValue Result =
          PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
}

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if target type is integer),
  // and ANDs off the high 8 bits. Since we turn this load into a
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
  // nodes. Do that here.
  SDValue Val = N->getOperand(0);
  SDValue Mask = N->getOperand(1);

  if (isa<ConstantSDNode>(Val)) {
    std::swap(Val, Mask);
  }

  SDValue AExt;
  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
  if (Val.getOpcode() == ISD::ANY_EXTEND) {
    AExt = Val;
    Val = Val->getOperand(0);
  }

  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
    Val = Val->getOperand(0);
  }

  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
      Val->getOpcode() == NVPTXISD::LoadV4) {
    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
    if (!MaskCnst) {
      // Not an AND with a constant
      return SDValue();
    }

    uint64_t MaskVal = MaskCnst->getZExtValue();
    if (MaskVal != 0xff) {
      // Not an AND that chops off the top 8 bits
      return SDValue();
    }

    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
    if (!Mem) {
      // Not a MemSDNode?!?
      return SDValue();
    }

    EVT MemVT = Mem->getMemoryVT();
    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
      // We only handle the i8 case
      return SDValue();
    }

    unsigned ExtType =
        cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands() - 1))
            ->getZExtValue();
    if (ExtType == ISD::SEXTLOAD) {
      // If for some reason the load is a sextload, the and is needed to zero
      // out the high 8 bits
      return SDValue();
    }

    bool AddTo = false;
    if (AExt.getNode() != nullptr) {
      // Re-insert the ext as a zext.
      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                            AExt.getValueType(), Val);
      AddTo = true;
    }

    // If we get here, the AND is unnecessary. Just replace it with the load
    DCI.CombineTo(N, Val, AddTo);
  }

  return SDValue();
}

static SDValue PerformREMCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);

  // Don't do anything at less than -O2.
  if (OptLevel < CodeGenOpt::Default)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  bool IsSigned = N->getOpcode() == ISD::SREM;
  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;

  const SDValue &Num = N->getOperand(0);
  const SDValue &Den = N->getOperand(1);

  for (const SDNode *U : Num->uses()) {
    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
        U->getOperand(1) == Den) {
      // Num % Den -> Num - (Num / Den) * Den
      return DAG.getNode(ISD::SUB, DL, VT, Num,
                         DAG.getNode(ISD::MUL, DL, VT,
                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
                                     Den));
    }
  }
  return SDValue();
}
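
// Worked example for the combine above (illustrative IR, not taken from a
// test case): given
//   %q = udiv i32 %x, %y
//   %r = urem i32 %x, %y
// the remainder is rewritten as
//   %r = sub i32 %x, (mul i32 %q, %y)
// reusing the existing divide, so only one hardware division is emitted for
// the pair.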

enum OperandSignedness {
  Signed = 0,
  Unsigned,
  Unknown
};

/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
/// that can be demoted to \p OptSize bits without loss of information. The
/// signedness of the operand, if determinable, is placed in \p S.
static bool IsMulWideOperandDemotable(SDValue Op,
                                      unsigned OptSize,
                                      OperandSignedness &S) {
  S = Unknown;

  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getSizeInBits() <= OptSize) {
      S = Signed;
      return true;
    }
  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getSizeInBits() <= OptSize) {
      S = Unsigned;
      return true;
    }
  }

  return false;
}

/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
/// be demoted to \p OptSize bits without loss of information. If the operands
/// contain a constant, it should appear as the RHS operand. The signedness of
/// the operands is placed in \p IsSigned.
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
                                        unsigned OptSize,
                                        bool &IsSigned) {
  OperandSignedness LHSSign;

  // The LHS operand must be a demotable op
  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
    return false;

  // We should have been able to determine the signedness from the LHS
  if (LHSSign == Unknown)
    return false;

  IsSigned = (LHSSign == Signed);

  // The RHS can be a demotable op or a constant
  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
    const APInt &Val = CI->getAPIntValue();
    if (LHSSign == Unsigned) {
      return Val.isIntN(OptSize);
    } else {
      return Val.isSignedIntN(OptSize);
    }
  } else {
    OperandSignedness RHSSign;
    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
      return false;

    return LHSSign == RHSSign;
  }
}

/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
/// amount.
static SDValue TryMULWIDECombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  EVT MulType = N->getValueType(0);
  if (MulType != MVT::i32 && MulType != MVT::i64) {
    return SDValue();
  }

  SDLoc DL(N);
  unsigned OptSize = MulType.getSizeInBits() >> 1;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Canonicalize the multiply so the constant (if any) is on the right
  if (N->getOpcode() == ISD::MUL) {
    if (isa<ConstantSDNode>(LHS)) {
      std::swap(LHS, RHS);
    }
  }

  // If we have a SHL, determine the actual multiply amount
  if (N->getOpcode() == ISD::SHL) {
    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
    if (!ShlRHS) {
      return SDValue();
    }

    APInt ShiftAmt = ShlRHS->getAPIntValue();
    unsigned BitWidth = MulType.getSizeInBits();
    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
    } else {
      return SDValue();
    }
  }

  bool Signed;
  // Verify that our operands are demotable
  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
    return SDValue();
  }

  EVT DemotedVT;
  if (MulType == MVT::i32) {
    DemotedVT = MVT::i16;
  } else {
    DemotedVT = MVT::i32;
  }

  // Truncate the operands to the correct size. Note that these are just for
  // type consistency and will (likely) be eliminated in later phases.
  SDValue TruncLHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
  SDValue TruncRHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);

  unsigned Opc;
  if (Signed) {
    Opc = NVPTXISD::MUL_WIDE_SIGNED;
  } else {
    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
  }

  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
}
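
// Illustrative sketch (the PTX mnemonic is an assumption, not asserted by
// this file) of what TryMULWIDECombine enables: for
//   %w = mul i32 (sext i16 %a to i32), (sext i16 %b to i32)
// the operands are demoted back to i16 and the node becomes
// NVPTXISD::MUL_WIDE_SIGNED, which can select to something like
//   mul.wide.s16 %r0, %rs1, %rs2;
// instead of two widening conversions followed by a full 32-bit multiply.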

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);

  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
                                   DCI.DAG.getVTList(MVT::i1, MVT::i1),
                                   {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD:
  case ISD::FADD:
    return PerformADDCombine(N, DCI, STI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI);
  }
  return SDValue();
}
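
// Illustrative note (hedged, assumed IR and PTX forms): a sufficiently
// aligned vector load such as
//   %v = load <4 x float>, <4 x float>* %p, align 16
// is custom-legalized by ReplaceLoadVector below into one NVPTXISD::LoadV4
// node plus a BUILD_VECTOR of its four results, which later selects to a
// vectorized PTX load (e.g. ld.global.v4.f32) rather than four scalar loads.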

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16: // <4 x f16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(ResVT.getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool LoadF16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have an ld.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
    assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
    LoadF16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
                     MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (LoadF16x2) {
    // Split v2f16 subvectors back into individual elements.
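    // (Illustrative: a v8f16 result arrives here as four v2f16 values from
    // the ld.v4.b32; lanes 0 and 1 of each are extracted below so ScalarRes
    // ends up holding the original eight f16 elements in order.)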
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}

static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization.
      // Therefore, we must ensure the type is legal. For i1 and i8, we set
      // the loaded type to i16 and propagate the "real" type as the memory
      // type.
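      // (Same widening trick as in ReplaceLoadVector above: sub-16-bit
      // element types are loaded as i16 and truncated back to the original
      // element type once the load has been built.)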
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands

      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      // Others
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec =
          DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  }
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}