//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static std::atomic<unsigned> GlobalUniqueCallSite;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));

static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
             " 1: do it, 2: do it aggressively)"),
    cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}

bool NVPTXTargetLowering::usePrecSqrtF32()
    const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16: // <4 x f16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64)
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16 elements. We must match this so we
      // stay in sync with Ins/Outs.
      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
        EltVT = MVT::v2f16;
        NumElts /= 2;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
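/// For instance (illustrative example, not from the original comment): an i24
/// value, or any other non-power-of-two integer narrower than 64 bits, is
/// widened to the next supported width, i32 in that case.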
///
/// The promoted type is placed in \p PromotedVT if the function returns true.
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
  if (VT.isScalarInteger()) {
    switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
    default:
      llvm_unreachable(
          "Promotion is not suitable for scalars of size larger than 64-bits");
    case 1:
      *PromotedVT = MVT::i1;
      break;
    case 2:
    case 4:
    case 8:
      *PromotedVT = MVT::i8;
      break;
    case 16:
      *PromotedVT = MVT::i16;
      break;
    case 32:
      *PromotedVT = MVT::i32;
      break;
    case 64:
      *PromotedVT = MVT::i64;
      break;
    }
    return EVT(*PromotedVT) != VT;
  }
  return false;
}

// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
  return NumElts;
}

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
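//
// Illustrative example (not from the original comment): four contiguous f32
// pieces at offsets 0, 4, 8 and 12 with 16-byte parameter alignment get marked
// PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST and are later emitted as a single
// 128-bit access, while an isolated i32 piece keeps the default PVF_SCALAR.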
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  // Check what we can vectorize using 128/64/32-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using the largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
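  // (Explanatory note: scalar f16 <-> integer conversions are assumed to map
  // onto PTX's cvt instruction, which is why the scalar cases below can simply
  // be marked Legal.)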
  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
                 MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant-related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // This is legal in NVPTX
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

  // TRAP can be lowered to PTX trap
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Custom handling for i8 intrinsics
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);
  setOperationAction(ISD::SUBC, MVT::i32, Legal);
  setOperationAction(ISD::SUBE, MVT::i32, Legal);
  if (STI.getPTXVersion() >= 43) {
    setOperationAction(ISD::ADDC, MVT::i64, Legal);
    setOperationAction(ISD::ADDE, MVT::i64, Legal);
    setOperationAction(ISD::SUBC, MVT::i64, Legal);
    setOperationAction(ISD::SUBE, MVT::i64, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::FADD, ISD::MUL, ISD::SHL,
                       ISD::SREM, ISD::UREM});

  // setcc for f16x2 needs special handling to prevent legalizer's
  // attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have a full implementation. Others
  // only have a token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
  }

  // There's no neg.f16 instruction. Expand to (0-x).
  setOperationAction(ISD::FNEG, MVT::f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);

  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }

  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);

  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op :
       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FABS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
  }
  // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
  auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
    bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
    return IsAtLeastSm80 ? Legal : NotSm80Action;
  };
  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  }
  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
    setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
  }

  // No FEXP2, FLOG2.
  // The PTX ex2 and log2 functions are always approximate.
  // No FPOW or FREM in PTX.

  // Now deduce the information based on the above mentioned
  // actions
  computeRegisterProperties(STI.getRegisterInfo());

  setMinCmpXchgSizeInBits(32);
}

const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((NVPTXISD::NodeType)Opcode) {
  case NVPTXISD::FIRST_NUMBER:
    break;
  case NVPTXISD::CALL:
    return "NVPTXISD::CALL";
  case NVPTXISD::RET_FLAG:
    return "NVPTXISD::RET_FLAG";
  case NVPTXISD::LOAD_PARAM:
    return "NVPTXISD::LOAD_PARAM";
  case NVPTXISD::Wrapper:
    return "NVPTXISD::Wrapper";
  case NVPTXISD::DeclareParam:
    return "NVPTXISD::DeclareParam";
  case NVPTXISD::DeclareScalarParam:
    return "NVPTXISD::DeclareScalarParam";
  case NVPTXISD::DeclareRet:
    return "NVPTXISD::DeclareRet";
  case NVPTXISD::DeclareScalarRet:
    return "NVPTXISD::DeclareScalarRet";
  case NVPTXISD::DeclareRetParam:
    return "NVPTXISD::DeclareRetParam";
  case NVPTXISD::PrintCall:
    return "NVPTXISD::PrintCall";
  case NVPTXISD::PrintConvergentCall:
    return "NVPTXISD::PrintConvergentCall";
  case NVPTXISD::PrintCallUni:
    return "NVPTXISD::PrintCallUni";
  case NVPTXISD::PrintConvergentCallUni:
    return "NVPTXISD::PrintConvergentCallUni";
  case NVPTXISD::LoadParam:
    return "NVPTXISD::LoadParam";
  case NVPTXISD::LoadParamV2:
    return "NVPTXISD::LoadParamV2";
  case NVPTXISD::LoadParamV4:
    return "NVPTXISD::LoadParamV4";
  case NVPTXISD::StoreParam:
    return "NVPTXISD::StoreParam";
  case NVPTXISD::StoreParamV2:
    return "NVPTXISD::StoreParamV2";
  case NVPTXISD::StoreParamV4:
    return "NVPTXISD::StoreParamV4";
  case NVPTXISD::StoreParamS32:
    return "NVPTXISD::StoreParamS32";
  case NVPTXISD::StoreParamU32:
    return "NVPTXISD::StoreParamU32";
  case NVPTXISD::CallArgBegin:
    return "NVPTXISD::CallArgBegin";
  case NVPTXISD::CallArg:
    return "NVPTXISD::CallArg";
  case NVPTXISD::LastCallArg:
    return "NVPTXISD::LastCallArg";
  case NVPTXISD::CallArgEnd:
    return "NVPTXISD::CallArgEnd";
  case NVPTXISD::CallVoid:
    return "NVPTXISD::CallVoid";
  case NVPTXISD::CallVal:
    return "NVPTXISD::CallVal";
  case NVPTXISD::CallSymbol:
    return "NVPTXISD::CallSymbol";
  case NVPTXISD::Prototype:
    return "NVPTXISD::Prototype";
  case NVPTXISD::MoveParam:
    return "NVPTXISD::MoveParam";
  case NVPTXISD::StoreRetval:
    return "NVPTXISD::StoreRetval";
  case NVPTXISD::StoreRetvalV2:
    return "NVPTXISD::StoreRetvalV2";
  case NVPTXISD::StoreRetvalV4:
    return "NVPTXISD::StoreRetvalV4";
  case NVPTXISD::PseudoUseParam:
    return "NVPTXISD::PseudoUseParam";
  case NVPTXISD::RETURN:
    return "NVPTXISD::RETURN";
  case NVPTXISD::CallSeqBegin:
    return "NVPTXISD::CallSeqBegin";
  case NVPTXISD::CallSeqEnd:
    return "NVPTXISD::CallSeqEnd";
  case NVPTXISD::CallPrototype:
    return "NVPTXISD::CallPrototype";
  case NVPTXISD::ProxyReg:
    return "NVPTXISD::ProxyReg";
  case NVPTXISD::LoadV2:
    return "NVPTXISD::LoadV2";
  case NVPTXISD::LoadV4:
    return "NVPTXISD::LoadV4";
  case NVPTXISD::LDGV2:
    return "NVPTXISD::LDGV2";
  case NVPTXISD::LDGV4:
    return "NVPTXISD::LDGV4";
  case NVPTXISD::LDUV2:
    return "NVPTXISD::LDUV2";
  case NVPTXISD::LDUV4:
    return "NVPTXISD::LDUV4";
  case NVPTXISD::StoreV2:
    return "NVPTXISD::StoreV2";
  case NVPTXISD::StoreV4:
    return
"NVPTXISD::StoreV4"; 728 case NVPTXISD::FUN_SHFL_CLAMP: 729 return "NVPTXISD::FUN_SHFL_CLAMP"; 730 case NVPTXISD::FUN_SHFR_CLAMP: 731 return "NVPTXISD::FUN_SHFR_CLAMP"; 732 case NVPTXISD::IMAD: 733 return "NVPTXISD::IMAD"; 734 case NVPTXISD::SETP_F16X2: 735 return "NVPTXISD::SETP_F16X2"; 736 case NVPTXISD::Dummy: 737 return "NVPTXISD::Dummy"; 738 case NVPTXISD::MUL_WIDE_SIGNED: 739 return "NVPTXISD::MUL_WIDE_SIGNED"; 740 case NVPTXISD::MUL_WIDE_UNSIGNED: 741 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 742 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; 743 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 744 case NVPTXISD::Tex1DFloatFloatLevel: 745 return "NVPTXISD::Tex1DFloatFloatLevel"; 746 case NVPTXISD::Tex1DFloatFloatGrad: 747 return "NVPTXISD::Tex1DFloatFloatGrad"; 748 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; 749 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; 750 case NVPTXISD::Tex1DS32FloatLevel: 751 return "NVPTXISD::Tex1DS32FloatLevel"; 752 case NVPTXISD::Tex1DS32FloatGrad: 753 return "NVPTXISD::Tex1DS32FloatGrad"; 754 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; 755 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; 756 case NVPTXISD::Tex1DU32FloatLevel: 757 return "NVPTXISD::Tex1DU32FloatLevel"; 758 case NVPTXISD::Tex1DU32FloatGrad: 759 return "NVPTXISD::Tex1DU32FloatGrad"; 760 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; 761 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; 762 case NVPTXISD::Tex1DArrayFloatFloatLevel: 763 return "NVPTXISD::Tex1DArrayFloatFloatLevel"; 764 case NVPTXISD::Tex1DArrayFloatFloatGrad: 765 return "NVPTXISD::Tex1DArrayFloatFloatGrad"; 766 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; 767 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; 768 case NVPTXISD::Tex1DArrayS32FloatLevel: 769 return "NVPTXISD::Tex1DArrayS32FloatLevel"; 770 case NVPTXISD::Tex1DArrayS32FloatGrad: 771 return "NVPTXISD::Tex1DArrayS32FloatGrad"; 772 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; 773 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; 774 case NVPTXISD::Tex1DArrayU32FloatLevel: 775 return "NVPTXISD::Tex1DArrayU32FloatLevel"; 776 case NVPTXISD::Tex1DArrayU32FloatGrad: 777 return "NVPTXISD::Tex1DArrayU32FloatGrad"; 778 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; 779 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 780 case NVPTXISD::Tex2DFloatFloatLevel: 781 return "NVPTXISD::Tex2DFloatFloatLevel"; 782 case NVPTXISD::Tex2DFloatFloatGrad: 783 return "NVPTXISD::Tex2DFloatFloatGrad"; 784 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; 785 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; 786 case NVPTXISD::Tex2DS32FloatLevel: 787 return "NVPTXISD::Tex2DS32FloatLevel"; 788 case NVPTXISD::Tex2DS32FloatGrad: 789 return "NVPTXISD::Tex2DS32FloatGrad"; 790 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; 791 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; 792 case NVPTXISD::Tex2DU32FloatLevel: 793 return "NVPTXISD::Tex2DU32FloatLevel"; 794 case NVPTXISD::Tex2DU32FloatGrad: 795 return "NVPTXISD::Tex2DU32FloatGrad"; 796 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; 797 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 798 case NVPTXISD::Tex2DArrayFloatFloatLevel: 799 return 
"NVPTXISD::Tex2DArrayFloatFloatLevel"; 800 case NVPTXISD::Tex2DArrayFloatFloatGrad: 801 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 802 case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; 803 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; 804 case NVPTXISD::Tex2DArrayS32FloatLevel: 805 return "NVPTXISD::Tex2DArrayS32FloatLevel"; 806 case NVPTXISD::Tex2DArrayS32FloatGrad: 807 return "NVPTXISD::Tex2DArrayS32FloatGrad"; 808 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; 809 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; 810 case NVPTXISD::Tex2DArrayU32FloatLevel: 811 return "NVPTXISD::Tex2DArrayU32FloatLevel"; 812 case NVPTXISD::Tex2DArrayU32FloatGrad: 813 return "NVPTXISD::Tex2DArrayU32FloatGrad"; 814 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; 815 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 816 case NVPTXISD::Tex3DFloatFloatLevel: 817 return "NVPTXISD::Tex3DFloatFloatLevel"; 818 case NVPTXISD::Tex3DFloatFloatGrad: 819 return "NVPTXISD::Tex3DFloatFloatGrad"; 820 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; 821 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; 822 case NVPTXISD::Tex3DS32FloatLevel: 823 return "NVPTXISD::Tex3DS32FloatLevel"; 824 case NVPTXISD::Tex3DS32FloatGrad: 825 return "NVPTXISD::Tex3DS32FloatGrad"; 826 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; 827 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; 828 case NVPTXISD::Tex3DU32FloatLevel: 829 return "NVPTXISD::Tex3DU32FloatLevel"; 830 case NVPTXISD::Tex3DU32FloatGrad: 831 return "NVPTXISD::Tex3DU32FloatGrad"; 832 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; 833 case NVPTXISD::TexCubeFloatFloatLevel: 834 return "NVPTXISD::TexCubeFloatFloatLevel"; 835 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; 836 case NVPTXISD::TexCubeS32FloatLevel: 837 return "NVPTXISD::TexCubeS32FloatLevel"; 838 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; 839 case NVPTXISD::TexCubeU32FloatLevel: 840 return "NVPTXISD::TexCubeU32FloatLevel"; 841 case NVPTXISD::TexCubeArrayFloatFloat: 842 return "NVPTXISD::TexCubeArrayFloatFloat"; 843 case NVPTXISD::TexCubeArrayFloatFloatLevel: 844 return "NVPTXISD::TexCubeArrayFloatFloatLevel"; 845 case NVPTXISD::TexCubeArrayS32Float: 846 return "NVPTXISD::TexCubeArrayS32Float"; 847 case NVPTXISD::TexCubeArrayS32FloatLevel: 848 return "NVPTXISD::TexCubeArrayS32FloatLevel"; 849 case NVPTXISD::TexCubeArrayU32Float: 850 return "NVPTXISD::TexCubeArrayU32Float"; 851 case NVPTXISD::TexCubeArrayU32FloatLevel: 852 return "NVPTXISD::TexCubeArrayU32FloatLevel"; 853 case NVPTXISD::Tld4R2DFloatFloat: 854 return "NVPTXISD::Tld4R2DFloatFloat"; 855 case NVPTXISD::Tld4G2DFloatFloat: 856 return "NVPTXISD::Tld4G2DFloatFloat"; 857 case NVPTXISD::Tld4B2DFloatFloat: 858 return "NVPTXISD::Tld4B2DFloatFloat"; 859 case NVPTXISD::Tld4A2DFloatFloat: 860 return "NVPTXISD::Tld4A2DFloatFloat"; 861 case NVPTXISD::Tld4R2DS64Float: 862 return "NVPTXISD::Tld4R2DS64Float"; 863 case NVPTXISD::Tld4G2DS64Float: 864 return "NVPTXISD::Tld4G2DS64Float"; 865 case NVPTXISD::Tld4B2DS64Float: 866 return "NVPTXISD::Tld4B2DS64Float"; 867 case NVPTXISD::Tld4A2DS64Float: 868 return "NVPTXISD::Tld4A2DS64Float"; 869 case NVPTXISD::Tld4R2DU64Float: 870 return "NVPTXISD::Tld4R2DU64Float"; 871 case NVPTXISD::Tld4G2DU64Float: 872 return "NVPTXISD::Tld4G2DU64Float"; 873 case NVPTXISD::Tld4B2DU64Float: 874 
return "NVPTXISD::Tld4B2DU64Float"; 875 case NVPTXISD::Tld4A2DU64Float: 876 return "NVPTXISD::Tld4A2DU64Float"; 877 878 case NVPTXISD::TexUnified1DFloatS32: 879 return "NVPTXISD::TexUnified1DFloatS32"; 880 case NVPTXISD::TexUnified1DFloatFloat: 881 return "NVPTXISD::TexUnified1DFloatFloat"; 882 case NVPTXISD::TexUnified1DFloatFloatLevel: 883 return "NVPTXISD::TexUnified1DFloatFloatLevel"; 884 case NVPTXISD::TexUnified1DFloatFloatGrad: 885 return "NVPTXISD::TexUnified1DFloatFloatGrad"; 886 case NVPTXISD::TexUnified1DS32S32: 887 return "NVPTXISD::TexUnified1DS32S32"; 888 case NVPTXISD::TexUnified1DS32Float: 889 return "NVPTXISD::TexUnified1DS32Float"; 890 case NVPTXISD::TexUnified1DS32FloatLevel: 891 return "NVPTXISD::TexUnified1DS32FloatLevel"; 892 case NVPTXISD::TexUnified1DS32FloatGrad: 893 return "NVPTXISD::TexUnified1DS32FloatGrad"; 894 case NVPTXISD::TexUnified1DU32S32: 895 return "NVPTXISD::TexUnified1DU32S32"; 896 case NVPTXISD::TexUnified1DU32Float: 897 return "NVPTXISD::TexUnified1DU32Float"; 898 case NVPTXISD::TexUnified1DU32FloatLevel: 899 return "NVPTXISD::TexUnified1DU32FloatLevel"; 900 case NVPTXISD::TexUnified1DU32FloatGrad: 901 return "NVPTXISD::TexUnified1DU32FloatGrad"; 902 case NVPTXISD::TexUnified1DArrayFloatS32: 903 return "NVPTXISD::TexUnified1DArrayFloatS32"; 904 case NVPTXISD::TexUnified1DArrayFloatFloat: 905 return "NVPTXISD::TexUnified1DArrayFloatFloat"; 906 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 907 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; 908 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 909 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; 910 case NVPTXISD::TexUnified1DArrayS32S32: 911 return "NVPTXISD::TexUnified1DArrayS32S32"; 912 case NVPTXISD::TexUnified1DArrayS32Float: 913 return "NVPTXISD::TexUnified1DArrayS32Float"; 914 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 915 return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; 916 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 917 return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; 918 case NVPTXISD::TexUnified1DArrayU32S32: 919 return "NVPTXISD::TexUnified1DArrayU32S32"; 920 case NVPTXISD::TexUnified1DArrayU32Float: 921 return "NVPTXISD::TexUnified1DArrayU32Float"; 922 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 923 return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; 924 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 925 return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; 926 case NVPTXISD::TexUnified2DFloatS32: 927 return "NVPTXISD::TexUnified2DFloatS32"; 928 case NVPTXISD::TexUnified2DFloatFloat: 929 return "NVPTXISD::TexUnified2DFloatFloat"; 930 case NVPTXISD::TexUnified2DFloatFloatLevel: 931 return "NVPTXISD::TexUnified2DFloatFloatLevel"; 932 case NVPTXISD::TexUnified2DFloatFloatGrad: 933 return "NVPTXISD::TexUnified2DFloatFloatGrad"; 934 case NVPTXISD::TexUnified2DS32S32: 935 return "NVPTXISD::TexUnified2DS32S32"; 936 case NVPTXISD::TexUnified2DS32Float: 937 return "NVPTXISD::TexUnified2DS32Float"; 938 case NVPTXISD::TexUnified2DS32FloatLevel: 939 return "NVPTXISD::TexUnified2DS32FloatLevel"; 940 case NVPTXISD::TexUnified2DS32FloatGrad: 941 return "NVPTXISD::TexUnified2DS32FloatGrad"; 942 case NVPTXISD::TexUnified2DU32S32: 943 return "NVPTXISD::TexUnified2DU32S32"; 944 case NVPTXISD::TexUnified2DU32Float: 945 return "NVPTXISD::TexUnified2DU32Float"; 946 case NVPTXISD::TexUnified2DU32FloatLevel: 947 return "NVPTXISD::TexUnified2DU32FloatLevel"; 948 case NVPTXISD::TexUnified2DU32FloatGrad: 949 return "NVPTXISD::TexUnified2DU32FloatGrad"; 950 case NVPTXISD::TexUnified2DArrayFloatS32: 
951 return "NVPTXISD::TexUnified2DArrayFloatS32"; 952 case NVPTXISD::TexUnified2DArrayFloatFloat: 953 return "NVPTXISD::TexUnified2DArrayFloatFloat"; 954 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 955 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; 956 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 957 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; 958 case NVPTXISD::TexUnified2DArrayS32S32: 959 return "NVPTXISD::TexUnified2DArrayS32S32"; 960 case NVPTXISD::TexUnified2DArrayS32Float: 961 return "NVPTXISD::TexUnified2DArrayS32Float"; 962 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 963 return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; 964 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 965 return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; 966 case NVPTXISD::TexUnified2DArrayU32S32: 967 return "NVPTXISD::TexUnified2DArrayU32S32"; 968 case NVPTXISD::TexUnified2DArrayU32Float: 969 return "NVPTXISD::TexUnified2DArrayU32Float"; 970 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 971 return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; 972 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 973 return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; 974 case NVPTXISD::TexUnified3DFloatS32: 975 return "NVPTXISD::TexUnified3DFloatS32"; 976 case NVPTXISD::TexUnified3DFloatFloat: 977 return "NVPTXISD::TexUnified3DFloatFloat"; 978 case NVPTXISD::TexUnified3DFloatFloatLevel: 979 return "NVPTXISD::TexUnified3DFloatFloatLevel"; 980 case NVPTXISD::TexUnified3DFloatFloatGrad: 981 return "NVPTXISD::TexUnified3DFloatFloatGrad"; 982 case NVPTXISD::TexUnified3DS32S32: 983 return "NVPTXISD::TexUnified3DS32S32"; 984 case NVPTXISD::TexUnified3DS32Float: 985 return "NVPTXISD::TexUnified3DS32Float"; 986 case NVPTXISD::TexUnified3DS32FloatLevel: 987 return "NVPTXISD::TexUnified3DS32FloatLevel"; 988 case NVPTXISD::TexUnified3DS32FloatGrad: 989 return "NVPTXISD::TexUnified3DS32FloatGrad"; 990 case NVPTXISD::TexUnified3DU32S32: 991 return "NVPTXISD::TexUnified3DU32S32"; 992 case NVPTXISD::TexUnified3DU32Float: 993 return "NVPTXISD::TexUnified3DU32Float"; 994 case NVPTXISD::TexUnified3DU32FloatLevel: 995 return "NVPTXISD::TexUnified3DU32FloatLevel"; 996 case NVPTXISD::TexUnified3DU32FloatGrad: 997 return "NVPTXISD::TexUnified3DU32FloatGrad"; 998 case NVPTXISD::TexUnifiedCubeFloatFloat: 999 return "NVPTXISD::TexUnifiedCubeFloatFloat"; 1000 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 1001 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; 1002 case NVPTXISD::TexUnifiedCubeS32Float: 1003 return "NVPTXISD::TexUnifiedCubeS32Float"; 1004 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 1005 return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; 1006 case NVPTXISD::TexUnifiedCubeU32Float: 1007 return "NVPTXISD::TexUnifiedCubeU32Float"; 1008 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 1009 return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; 1010 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 1011 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; 1012 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 1013 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; 1014 case NVPTXISD::TexUnifiedCubeArrayS32Float: 1015 return "NVPTXISD::TexUnifiedCubeArrayS32Float"; 1016 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 1017 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; 1018 case NVPTXISD::TexUnifiedCubeArrayU32Float: 1019 return "NVPTXISD::TexUnifiedCubeArrayU32Float"; 1020 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 1021 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; 1022 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 1023 return 
"NVPTXISD::Tld4UnifiedR2DFloatFloat"; 1024 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 1025 return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; 1026 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 1027 return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; 1028 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 1029 return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; 1030 case NVPTXISD::Tld4UnifiedR2DS64Float: 1031 return "NVPTXISD::Tld4UnifiedR2DS64Float"; 1032 case NVPTXISD::Tld4UnifiedG2DS64Float: 1033 return "NVPTXISD::Tld4UnifiedG2DS64Float"; 1034 case NVPTXISD::Tld4UnifiedB2DS64Float: 1035 return "NVPTXISD::Tld4UnifiedB2DS64Float"; 1036 case NVPTXISD::Tld4UnifiedA2DS64Float: 1037 return "NVPTXISD::Tld4UnifiedA2DS64Float"; 1038 case NVPTXISD::Tld4UnifiedR2DU64Float: 1039 return "NVPTXISD::Tld4UnifiedR2DU64Float"; 1040 case NVPTXISD::Tld4UnifiedG2DU64Float: 1041 return "NVPTXISD::Tld4UnifiedG2DU64Float"; 1042 case NVPTXISD::Tld4UnifiedB2DU64Float: 1043 return "NVPTXISD::Tld4UnifiedB2DU64Float"; 1044 case NVPTXISD::Tld4UnifiedA2DU64Float: 1045 return "NVPTXISD::Tld4UnifiedA2DU64Float"; 1046 1047 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; 1048 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; 1049 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; 1050 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; 1051 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; 1052 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; 1053 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; 1054 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; 1055 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; 1056 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; 1057 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; 1058 1059 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; 1060 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; 1061 case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; 1062 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; 1063 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; 1064 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; 1065 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; 1066 case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp"; 1067 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; 1068 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; 1069 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; 1070 1071 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; 1072 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; 1073 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; 1074 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; 1075 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; 1076 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; 1077 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; 1078 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; 1079 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; 1080 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; 1081 case NVPTXISD::Suld2DV4I32Clamp: 
return "NVPTXISD::Suld2DV4I32Clamp"; 1082 1083 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; 1084 case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; 1085 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; 1086 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; 1087 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; 1088 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; 1089 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; 1090 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; 1091 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; 1092 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; 1093 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; 1094 1095 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; 1096 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; 1097 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; 1098 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; 1099 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; 1100 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; 1101 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; 1102 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; 1103 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; 1104 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; 1105 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; 1106 1107 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 1108 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 1109 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 1110 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; 1111 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 1112 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 1113 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 1114 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; 1115 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 1116 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 1117 case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 1118 1119 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 1120 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 1121 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 1122 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; 1123 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 1124 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 1125 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 1126 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; 1127 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 1128 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 1129 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 1130 1131 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 1132 case NVPTXISD::Suld2DI16Trap: return 
"NVPTXISD::Suld2DI16Trap"; 1133 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 1134 case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; 1135 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 1136 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; 1137 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 1138 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; 1139 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 1140 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 1141 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 1142 1143 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 1144 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 1145 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 1146 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; 1147 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 1148 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 1149 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 1150 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; 1151 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 1152 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 1153 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 1154 1155 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 1156 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 1157 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 1158 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; 1159 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 1160 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 1161 case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; 1162 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; 1163 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 1164 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 1165 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 1166 1167 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; 1168 case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero"; 1169 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; 1170 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; 1171 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; 1172 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; 1173 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; 1174 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; 1175 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; 1176 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; 1177 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; 1178 1179 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; 1180 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; 1181 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; 1182 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; 1183 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; 1184 case 
NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
  case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
  case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
  case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
  case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
  case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";

  case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
  case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
  case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
  case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
  case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
  case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
  case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
  case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
  case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
  case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
  case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";

  case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
  case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
  case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
  case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
  case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
  case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
  case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
  case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
  case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
  case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
  case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";

  case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
  case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
  case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
  case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
  case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
  case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
  case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
  case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
  case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
  case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
  case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
  }
  return nullptr;
}

TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
      VT.getScalarType() == MVT::i1)
    return TypeSplitVector;
  if (VT == MVT::v2f16)
    return TypeLegal;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}

SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                             int Enabled, int &ExtraSteps,
                                             bool &UseOneConst,
                                             bool Reciprocal) const {
  if
      (!(Enabled == ReciprocalEstimate::Enabled ||
         (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
    return SDValue();

  if (ExtraSteps == ReciprocalEstimate::Unspecified)
    ExtraSteps = 0;

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();
  bool Ftz = useF32FTZ(DAG.getMachineFunction());

  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IID, DL, MVT::i32), Operand);
  };

  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt. Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
  // any refinement, we must return a regular sqrt.
  if (Reciprocal || ExtraSteps > 0) {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
    else
      return SDValue();
  } else {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
    else {
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)). This is faster than
      // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
      // x * rsqrt(x).)
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, VT,
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
    }
  }
}

SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}

std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
    const CallBase &CB, unsigned UniqueCallSite) const {
  auto PtrVT = getPointerTy(DL);

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return "";

  std::stringstream O;
  O << "prototype_" << UniqueCallSite << " : .callprototype ";

  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if (retTy->isFloatingPointTy() ||
        (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
      unsigned size = 0;
      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }
      // PTX ABI requires all scalar return values to be at least 32
      // bits in size. fp16 normally uses .b16 as its storage type in
      // PTX, so its size must be adjusted here, too.
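      // (For example, a half (f16) or i8 return value ends up declared as
      // ".param .b32 _" in the prototype string built below.)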
1328 size = promoteScalarArgumentSize(size); 1329 1330 O << ".param .b" << size << " _"; 1331 } else if (isa<PointerType>(retTy)) { 1332 O << ".param .b" << PtrVT.getSizeInBits() << " _"; 1333 } else if (retTy->isAggregateType() || retTy->isVectorTy() || 1334 retTy->isIntegerTy(128)) { 1335 O << ".param .align " << (retAlignment ? retAlignment->value() : 0) 1336 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; 1337 } else { 1338 llvm_unreachable("Unknown return type"); 1339 } 1340 O << ") "; 1341 } 1342 O << "_ ("; 1343 1344 bool first = true; 1345 1346 const Function *F = CB.getFunction(); 1347 for (unsigned i = 0, e = Args.size(), OIdx = 0; i != e; ++i, ++OIdx) { 1348 Type *Ty = Args[i].Ty; 1349 if (!first) { 1350 O << ", "; 1351 } 1352 first = false; 1353 1354 if (!Outs[OIdx].Flags.isByVal()) { 1355 if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) { 1356 unsigned ParamAlign = 0; 1357 const CallInst *CallI = cast<CallInst>(&CB); 1358 // +1 because index 0 is reserved for return type alignment 1359 if (!getAlign(*CallI, i + 1, ParamAlign)) 1360 ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); 1361 O << ".param .align " << ParamAlign << " .b8 "; 1362 O << "_"; 1363 O << "[" << DL.getTypeAllocSize(Ty) << "]"; 1364 // update the index for Outs 1365 SmallVector<EVT, 16> vtparts; 1366 ComputeValueVTs(*this, DL, Ty, vtparts); 1367 if (unsigned len = vtparts.size()) 1368 OIdx += len - 1; 1369 continue; 1370 } 1371 // i8 types in IR will be i16 types in SDAG 1372 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1373 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1374 "type mismatch between callee prototype and arguments"); 1375 // scalar type 1376 unsigned sz = 0; 1377 if (isa<IntegerType>(Ty)) { 1378 sz = cast<IntegerType>(Ty)->getBitWidth(); 1379 sz = promoteScalarArgumentSize(sz); 1380 } else if (isa<PointerType>(Ty)) { 1381 sz = PtrVT.getSizeInBits(); 1382 } else if (Ty->isHalfTy()) 1383 // PTX ABI requires all scalar parameters to be at least 32 1384 // bits in size. fp16 normally uses .b16 as its storage type 1385 // in PTX, so its size must be adjusted here, too. 1386 sz = 32; 1387 else 1388 sz = Ty->getPrimitiveSizeInBits(); 1389 O << ".param .b" << sz << " "; 1390 O << "_"; 1391 continue; 1392 } 1393 1394 Align ParamByValAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1395 1396 // Try to increase alignment. This code matches logic in LowerCall when 1397 // alignment increase is performed to increase vectorization options. 1398 Type *ETy = Args[i].IndirectType; 1399 Align AlignCandidate = getFunctionParamOptimizedAlign(F, ETy, DL); 1400 ParamByValAlign = std::max(ParamByValAlign, AlignCandidate); 1401 1402 O << ".param .align " << ParamByValAlign.value() << " .b8 "; 1403 O << "_"; 1404 O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; 1405 } 1406 O << ");"; 1407 return O.str(); 1408 } 1409 1410 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, 1411 const CallBase *CB, Type *Ty, 1412 unsigned Idx, 1413 const DataLayout &DL) const { 1414 if (!CB) { 1415 // CallSite is zero, fallback to ABI type alignment 1416 return DL.getABITypeAlign(Ty); 1417 } 1418 1419 unsigned Alignment = 0; 1420 const Function *DirectCallee = CB->getCalledFunction(); 1421 1422 if (!DirectCallee) { 1423 // We don't have a direct function symbol, but that may be because of 1424 // constant cast instructions in the call. 
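// For example (with typed pointers), the callee may be a ConstantExpr such as
//   bitcast (i32 (i32)* @f to i32 (i8)*)
// (illustrative only; the loop below looks through any such casts).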
1425 1426 // With bitcast'd call targets, the instruction will be the call 1427 if (const auto *CI = dyn_cast<CallInst>(CB)) { 1428 // Check if we have call alignment metadata 1429 if (getAlign(*CI, Idx, Alignment)) 1430 return Align(Alignment); 1431 1432 const Value *CalleeV = CI->getCalledOperand(); 1433 // Ignore any bitcast instructions 1434 while (isa<ConstantExpr>(CalleeV)) { 1435 const ConstantExpr *CE = cast<ConstantExpr>(CalleeV); 1436 if (!CE->isCast()) 1437 break; 1438 // Look through the bitcast 1439 CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0); 1440 } 1441 1442 // We have now looked past all of the bitcasts. Do we finally have a 1443 // Function? 1444 if (const auto *CalleeF = dyn_cast<Function>(CalleeV)) 1445 DirectCallee = CalleeF; 1446 } 1447 } 1448 1449 // Check for function alignment information if we found that the 1450 // ultimate target is a Function 1451 if (DirectCallee) { 1452 if (getAlign(*DirectCallee, Idx, Alignment)) 1453 return Align(Alignment); 1454 // If alignment information is not available, fall back to the 1455 // default function param optimized type alignment 1456 return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); 1457 } 1458 1459 // Call is indirect, fall back to the ABI type alignment 1460 return DL.getABITypeAlign(Ty); 1461 } 1462 1463 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1464 SmallVectorImpl<SDValue> &InVals) const { 1465 SelectionDAG &DAG = CLI.DAG; 1466 SDLoc dl = CLI.DL; 1467 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1468 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1469 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1470 SDValue Chain = CLI.Chain; 1471 SDValue Callee = CLI.Callee; 1472 bool &isTailCall = CLI.IsTailCall; 1473 ArgListTy &Args = CLI.getArgs(); 1474 Type *RetTy = CLI.RetTy; 1475 const CallBase *CB = CLI.CB; 1476 const DataLayout &DL = DAG.getDataLayout(); 1477 1478 bool isABI = (STI.getSmVersion() >= 20); 1479 assert(isABI && "Non-ABI compilation is not supported"); 1480 if (!isABI) 1481 return Chain; 1482 1483 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); 1484 SDValue TempChain = Chain; 1485 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); 1486 SDValue InFlag = Chain.getValue(1); 1487 1488 unsigned ParamCount = 0; 1489 // Args.size() and Outs.size() need not match. 1490 // Outs.size() will be larger 1491 // * if there is an aggregate argument with multiple fields (each field 1492 // showing up separately in Outs) 1493 // * if there is a vector argument with more than typical vector-length 1494 // elements (generally if more than 4) where each vector element is 1495 // individually present in Outs. 1496 // So a different index should be used for indexing into Outs/OutVals. 1497 // See similar issue in LowerFormalArguments. 1498 unsigned OIdx = 0; 1499 // Declare the .params or .reg need to pass values 1500 // to the function 1501 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1502 EVT VT = Outs[OIdx].VT; 1503 Type *Ty = Args[i].Ty; 1504 bool IsByVal = Outs[OIdx].Flags.isByVal(); 1505 1506 SmallVector<EVT, 16> VTs; 1507 SmallVector<uint64_t, 16> Offsets; 1508 1509 assert((!IsByVal || Args[i].IndirectType) && 1510 "byval arg must have indirect type"); 1511 Type *ETy = (IsByVal ? 
Args[i].IndirectType : Ty); 1512 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets); 1513 1514 Align ArgAlign; 1515 if (IsByVal) { 1516 // The ByValAlign in the Outs[OIdx].Flags is always set at this point, 1517 // so we don't need to worry whether it's naturally aligned or not. 1518 // See TargetLowering::LowerCallTo(). 1519 ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1520 1521 // Try to increase alignment to enhance vectorization options. 1522 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign( 1523 CB->getCalledFunction(), ETy, DL)); 1524 1525 // Enforce minimum alignment of 4 to work around ptxas miscompile 1526 // for sm_50+. See corresponding alignment adjustment in 1527 // emitFunctionParamList() for details. 1528 ArgAlign = std::max(ArgAlign, Align(4)); 1529 } else { 1530 ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL); 1531 } 1532 1533 unsigned TypeSize = 1534 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); 1535 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1536 1537 bool NeedAlign; // Does argument declaration specify alignment? 1538 if (IsByVal || 1539 (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128))) { 1540 // declare .param .align <align> .b8 .param<n>[<size>]; 1541 SDValue DeclareParamOps[] = { 1542 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1543 DAG.getConstant(ParamCount, dl, MVT::i32), 1544 DAG.getConstant(TypeSize, dl, MVT::i32), InFlag}; 1545 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1546 DeclareParamOps); 1547 NeedAlign = true; 1548 } else { 1549 // declare .param .b<size> .param<n>; 1550 if (VT.isInteger() || VT.isFloatingPoint()) { 1551 // PTX ABI requires integral types to be at least 32 bits in 1552 // size. FP16 is loaded/stored using i16, so it's handled 1553 // here as well. 1554 TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8; 1555 } 1556 SDValue DeclareScalarParamOps[] = { 1557 Chain, DAG.getConstant(ParamCount, dl, MVT::i32), 1558 DAG.getConstant(TypeSize * 8, dl, MVT::i32), 1559 DAG.getConstant(0, dl, MVT::i32), InFlag}; 1560 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1561 DeclareScalarParamOps); 1562 NeedAlign = false; 1563 } 1564 InFlag = Chain.getValue(1); 1565 1566 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1567 // than 32-bits are sign extended or zero extended, depending on 1568 // whether they are signed or unsigned types. This case applies 1569 // only to scalar parameters and not to aggregate values. 1570 bool ExtendIntegerParam = 1571 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1572 1573 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); 1574 SmallVector<SDValue, 6> StoreOperands; 1575 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1576 EVT EltVT = VTs[j]; 1577 int CurOffset = Offsets[j]; 1578 MaybeAlign PartAlign; 1579 if (NeedAlign) 1580 PartAlign = commonAlignment(ArgAlign, CurOffset); 1581 1582 // New store.
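// PVF_FIRST marks the first element of a (possibly single-element) group:
// start a fresh operand list with the chain, the parameter index, and the
// byte offset of this group within the parameter.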
1583 if (VectorInfo[j] & PVF_FIRST) { 1584 assert(StoreOperands.empty() && "Unfinished preceding store."); 1585 StoreOperands.push_back(Chain); 1586 StoreOperands.push_back(DAG.getConstant(ParamCount, dl, MVT::i32)); 1587 StoreOperands.push_back(DAG.getConstant(CurOffset, dl, MVT::i32)); 1588 } 1589 1590 SDValue StVal = OutVals[OIdx]; 1591 1592 MVT PromotedVT; 1593 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 1594 EltVT = EVT(PromotedVT); 1595 } 1596 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) { 1597 llvm::ISD::NodeType Ext = 1598 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1599 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal); 1600 } 1601 1602 if (IsByVal) { 1603 auto PtrVT = getPointerTy(DL); 1604 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal, 1605 DAG.getConstant(CurOffset, dl, PtrVT)); 1606 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(), 1607 PartAlign); 1608 } else if (ExtendIntegerParam) { 1609 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1610 // zext/sext to i32 1611 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1612 : ISD::ZERO_EXTEND, 1613 dl, MVT::i32, StVal); 1614 } 1615 1616 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) { 1617 // Use 16-bit registers for small stores as it's the 1618 // smallest general purpose register size supported by NVPTX. 1619 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1620 } 1621 1622 // Record the value to store. 1623 StoreOperands.push_back(StVal); 1624 1625 if (VectorInfo[j] & PVF_LAST) { 1626 unsigned NumElts = StoreOperands.size() - 3; 1627 NVPTXISD::NodeType Op; 1628 switch (NumElts) { 1629 case 1: 1630 Op = NVPTXISD::StoreParam; 1631 break; 1632 case 2: 1633 Op = NVPTXISD::StoreParamV2; 1634 break; 1635 case 4: 1636 Op = NVPTXISD::StoreParamV4; 1637 break; 1638 default: 1639 llvm_unreachable("Invalid vector info."); 1640 } 1641 1642 StoreOperands.push_back(InFlag); 1643 1644 // Adjust type of the store op if we've extended the scalar 1645 // return value. 1646 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; 1647 1648 Chain = DAG.getMemIntrinsicNode( 1649 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1650 TheStoreType, MachinePointerInfo(), PartAlign, 1651 MachineMemOperand::MOStore); 1652 InFlag = Chain.getValue(1); 1653 1654 // Cleanup. 1655 StoreOperands.clear(); 1656 } 1657 if (!IsByVal) 1658 ++OIdx; 1659 } 1660 assert(StoreOperands.empty() && "Unfinished parameter store."); 1661 if (!IsByVal && VTs.size() > 0) 1662 --OIdx; 1663 ++ParamCount; 1664 } 1665 1666 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); 1667 MaybeAlign retAlignment = None; 1668 1669 // Handle Result 1670 if (Ins.size() > 0) { 1671 SmallVector<EVT, 16> resvtparts; 1672 ComputeValueVTs(*this, DL, RetTy, resvtparts); 1673 1674 // Declare 1675 // .param .align 16 .b8 retval0[<size-in-bytes>], or 1676 // .param .b<size-in-bits> retval0 1677 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); 1678 // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for 1679 // these three types to match the logic in 1680 // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype. 1681 // Plus, this behavior is consistent with nvcc's. 
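// Note that i128 intentionally falls through to the byte-array (aggregate)
// case below, while scalar returns narrower than 32 bits are promoted to 32.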
if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() || 1683 (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) { 1684 resultsz = promoteScalarArgumentSize(resultsz); 1685 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1686 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1687 DAG.getConstant(resultsz, dl, MVT::i32), 1688 DAG.getConstant(0, dl, MVT::i32), InFlag }; 1689 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, 1690 DeclareRetOps); 1691 InFlag = Chain.getValue(1); 1692 } else { 1693 retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 1694 assert(retAlignment && "retAlignment is guaranteed to be set"); 1695 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1696 SDValue DeclareRetOps[] = { 1697 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), 1698 DAG.getConstant(resultsz / 8, dl, MVT::i32), 1699 DAG.getConstant(0, dl, MVT::i32), InFlag}; 1700 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, 1701 DeclareRetOps); 1702 InFlag = Chain.getValue(1); 1703 } 1704 } 1705 1706 // Both indirect calls and libcalls have nullptr Func. In order to distinguish 1707 // between them we must rely on the call site value which is valid for 1708 // indirect calls but is always null for libcalls. 1709 bool isIndirectCall = !Func && CB; 1710 1711 if (isa<ExternalSymbolSDNode>(Callee)) { 1712 Function* CalleeFunc = nullptr; 1713 1714 // Try to find the callee in the current module. 1715 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc); 1716 assert(CalleeFunc != nullptr && "Libcall callee must be set."); 1717 1718 // Set the "libcall callee" attribute to indicate that the function 1719 // must always have a declaration. 1720 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true"); 1721 } 1722 1723 if (isIndirectCall) { 1724 // This is the indirect function call case: PTX requires a prototype of the 1725 // form 1726 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); 1727 // to be emitted, and the label has to be used as the last arg of the call 1728 // instruction. 1729 // The prototype is embedded in a string and put as the operand for a 1730 // CallPrototype SDNode which will print out the value of the string. 1731 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1732 std::string Proto = 1733 getPrototype(DL, RetTy, Args, Outs, retAlignment, *CB, UniqueCallSite); 1734 const char *ProtoStr = 1735 nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str(); 1736 SDValue ProtoOps[] = { 1737 Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag, 1738 }; 1739 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); 1740 InFlag = Chain.getValue(1); 1741 } 1742 // Op to just print "call" 1743 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1744 SDValue PrintCallOps[] = { 1745 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag 1746 }; 1747 // We model convergent calls as separate opcodes. 1748 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; 1749 if (CLI.IsConvergent) 1750 Opcode = Opcode == NVPTXISD::PrintCallUni ?
NVPTXISD::PrintConvergentCallUni 1751 : NVPTXISD::PrintConvergentCall; 1752 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 1753 InFlag = Chain.getValue(1); 1754 1755 // Ops to print out the function name 1756 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1757 SDValue CallVoidOps[] = { Chain, Callee, InFlag }; 1758 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 1759 InFlag = Chain.getValue(1); 1760 1761 // Ops to print out the param list 1762 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1763 SDValue CallArgBeginOps[] = { Chain, InFlag }; 1764 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 1765 CallArgBeginOps); 1766 InFlag = Chain.getValue(1); 1767 1768 for (unsigned i = 0, e = ParamCount; i != e; ++i) { 1769 unsigned opcode; 1770 if (i == (e - 1)) 1771 opcode = NVPTXISD::LastCallArg; 1772 else 1773 opcode = NVPTXISD::CallArg; 1774 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1775 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1776 DAG.getConstant(i, dl, MVT::i32), InFlag }; 1777 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 1778 InFlag = Chain.getValue(1); 1779 } 1780 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1781 SDValue CallArgEndOps[] = { Chain, 1782 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32), 1783 InFlag }; 1784 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 1785 InFlag = Chain.getValue(1); 1786 1787 if (isIndirectCall) { 1788 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1789 SDValue PrototypeOps[] = { 1790 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InFlag}; 1791 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 1792 InFlag = Chain.getValue(1); 1793 } 1794 1795 SmallVector<SDValue, 16> ProxyRegOps; 1796 SmallVector<Optional<MVT>, 16> ProxyRegTruncates; 1797 1798 // Generate loads from param memory/moves from registers for result 1799 if (Ins.size() > 0) { 1800 SmallVector<EVT, 16> VTs; 1801 SmallVector<uint64_t, 16> Offsets; 1802 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 1803 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 1804 1805 Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 1806 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 1807 1808 SmallVector<EVT, 6> LoadVTs; 1809 int VecIdx = -1; // Index of the first element of the vector. 1810 1811 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 1812 // 32-bits are sign extended or zero extended, depending on whether 1813 // they are signed or unsigned types. 1814 bool ExtendIntegerRetVal = 1815 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 1816 1817 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 1818 bool needTruncate = false; 1819 EVT TheLoadType = VTs[i]; 1820 EVT EltType = Ins[i].VT; 1821 Align EltAlign = commonAlignment(RetAlign, Offsets[i]); 1822 MVT PromotedVT; 1823 1824 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) { 1825 TheLoadType = EVT(PromotedVT); 1826 EltType = EVT(PromotedVT); 1827 needTruncate = true; 1828 } 1829 1830 if (ExtendIntegerRetVal) { 1831 TheLoadType = MVT::i32; 1832 EltType = MVT::i32; 1833 needTruncate = true; 1834 } else if (TheLoadType.getSizeInBits() < 16) { 1835 if (VTs[i].isInteger()) 1836 needTruncate = true; 1837 EltType = MVT::i16; 1838 } 1839 1840 // Record index of the very first element of the vector. 
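// Elements between PVF_FIRST and PVF_LAST form one group that is returned by
// a single LoadParam/LoadParamV2/LoadParamV4 node emitted when PVF_LAST is seen.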
1841 if (VectorInfo[i] & PVF_FIRST) { 1842 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 1843 VecIdx = i; 1844 } 1845 1846 LoadVTs.push_back(EltType); 1847 1848 if (VectorInfo[i] & PVF_LAST) { 1849 unsigned NumElts = LoadVTs.size(); 1850 LoadVTs.push_back(MVT::Other); 1851 LoadVTs.push_back(MVT::Glue); 1852 NVPTXISD::NodeType Op; 1853 switch (NumElts) { 1854 case 1: 1855 Op = NVPTXISD::LoadParam; 1856 break; 1857 case 2: 1858 Op = NVPTXISD::LoadParamV2; 1859 break; 1860 case 4: 1861 Op = NVPTXISD::LoadParamV4; 1862 break; 1863 default: 1864 llvm_unreachable("Invalid vector info."); 1865 } 1866 1867 SDValue LoadOperands[] = { 1868 Chain, DAG.getConstant(1, dl, MVT::i32), 1869 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; 1870 SDValue RetVal = DAG.getMemIntrinsicNode( 1871 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 1872 MachinePointerInfo(), EltAlign, 1873 MachineMemOperand::MOLoad); 1874 1875 for (unsigned j = 0; j < NumElts; ++j) { 1876 ProxyRegOps.push_back(RetVal.getValue(j)); 1877 1878 if (needTruncate) 1879 ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT)); 1880 else 1881 ProxyRegTruncates.push_back(Optional<MVT>()); 1882 } 1883 1884 Chain = RetVal.getValue(NumElts); 1885 InFlag = RetVal.getValue(NumElts + 1); 1886 1887 // Cleanup 1888 VecIdx = -1; 1889 LoadVTs.clear(); 1890 } 1891 } 1892 } 1893 1894 Chain = DAG.getCALLSEQ_END( 1895 Chain, DAG.getIntPtrConstant(UniqueCallSite, dl, true), 1896 DAG.getIntPtrConstant(UniqueCallSite + 1, dl, true), InFlag, dl); 1897 InFlag = Chain.getValue(1); 1898 1899 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 1900 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 1901 // dangling. 1902 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 1903 SDValue Ret = DAG.getNode( 1904 NVPTXISD::ProxyReg, dl, 1905 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 1906 { Chain, ProxyRegOps[i], InFlag } 1907 ); 1908 1909 Chain = Ret.getValue(1); 1910 InFlag = Ret.getValue(2); 1911 1912 if (ProxyRegTruncates[i]) { 1913 Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].value(), Ret); 1914 } 1915 1916 InVals.push_back(Ret); 1917 } 1918 1919 // set isTailCall to false for now, until we figure out how to express 1920 // tail call optimization in PTX 1921 isTailCall = false; 1922 return Chain; 1923 } 1924 1925 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 1926 // (see LegalizeDAG.cpp). This is slow and uses local memory. 1927 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 1928 SDValue 1929 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 1930 SDNode *Node = Op.getNode(); 1931 SDLoc dl(Node); 1932 SmallVector<SDValue, 8> Ops; 1933 unsigned NumOperands = Node->getNumOperands(); 1934 for (unsigned i = 0; i < NumOperands; ++i) { 1935 SDValue SubOp = Node->getOperand(i); 1936 EVT VVT = SubOp.getNode()->getValueType(0); 1937 EVT EltVT = VVT.getVectorElementType(); 1938 unsigned NumSubElem = VVT.getVectorNumElements(); 1939 for (unsigned j = 0; j < NumSubElem; ++j) { 1940 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 1941 DAG.getIntPtrConstant(j, dl))); 1942 } 1943 } 1944 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 1945 } 1946 1947 // We can init constant f16x2 with a single .b32 move. Normally it 1948 // would get lowered as two constant loads and vector-packing move. 
1949 // mov.b16 %h1, 0x4000; 1950 // mov.b16 %h2, 0x3C00; 1951 // mov.b32 %hh2, {%h2, %h1}; 1952 // Instead we want just a constant move: 1953 // mov.b32 %hh2, 0x40003C00 1954 // 1955 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 1956 // generates good SASS in both cases. 1957 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 1958 SelectionDAG &DAG) const { 1959 //return Op; 1960 if (!(Op->getValueType(0) == MVT::v2f16 && 1961 isa<ConstantFPSDNode>(Op->getOperand(0)) && 1962 isa<ConstantFPSDNode>(Op->getOperand(1)))) 1963 return Op; 1964 1965 APInt E0 = 1966 cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); 1967 APInt E1 = 1968 cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); 1969 SDValue Const = 1970 DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); 1971 return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const); 1972 } 1973 1974 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 1975 SelectionDAG &DAG) const { 1976 SDValue Index = Op->getOperand(1); 1977 // Constant index will be matched by tablegen. 1978 if (isa<ConstantSDNode>(Index.getNode())) 1979 return Op; 1980 1981 // Extract individual elements and select one of them. 1982 SDValue Vector = Op->getOperand(0); 1983 EVT VectorVT = Vector.getValueType(); 1984 assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); 1985 EVT EltVT = VectorVT.getVectorElementType(); 1986 1987 SDLoc dl(Op.getNode()); 1988 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 1989 DAG.getIntPtrConstant(0, dl)); 1990 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 1991 DAG.getIntPtrConstant(1, dl)); 1992 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 1993 ISD::CondCode::SETEQ); 1994 } 1995 1996 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 1997 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 1998 /// amount, or 1999 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2000 /// amount. 2001 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 2002 SelectionDAG &DAG) const { 2003 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2004 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 2005 2006 EVT VT = Op.getValueType(); 2007 unsigned VTBits = VT.getSizeInBits(); 2008 SDLoc dl(Op); 2009 SDValue ShOpLo = Op.getOperand(0); 2010 SDValue ShOpHi = Op.getOperand(1); 2011 SDValue ShAmt = Op.getOperand(2); 2012 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 2013 2014 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2015 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2016 // {dHi, dLo} = {aHi, aLo} >> Amt 2017 // dHi = aHi >> Amt 2018 // dLo = shf.r.clamp aLo, aHi, Amt 2019 2020 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2021 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 2022 ShAmt); 2023 2024 SDValue Ops[2] = { Lo, Hi }; 2025 return DAG.getMergeValues(Ops, dl); 2026 } 2027 else { 2028 // {dHi, dLo} = {aHi, aLo} >> Amt 2029 // - if (Amt>=size) then 2030 // dLo = aHi >> (Amt-size) 2031 // dHi = aHi >> Amt (this is either all 0 or all 1) 2032 // else 2033 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 2034 // dHi = aHi >> Amt 2035 2036 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2037 DAG.getConstant(VTBits, dl, MVT::i32), 2038 ShAmt); 2039 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 2040 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2041 DAG.getConstant(VTBits, dl, MVT::i32)); 2042 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 2043 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2044 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2045 2046 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2047 DAG.getConstant(VTBits, dl, MVT::i32), 2048 ISD::SETGE); 2049 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2050 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2051 2052 SDValue Ops[2] = { Lo, Hi }; 2053 return DAG.getMergeValues(Ops, dl); 2054 } 2055 } 2056 2057 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2058 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2059 /// amount, or 2060 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2061 /// amount. 2062 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2063 SelectionDAG &DAG) const { 2064 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2065 assert(Op.getOpcode() == ISD::SHL_PARTS); 2066 2067 EVT VT = Op.getValueType(); 2068 unsigned VTBits = VT.getSizeInBits(); 2069 SDLoc dl(Op); 2070 SDValue ShOpLo = Op.getOperand(0); 2071 SDValue ShOpHi = Op.getOperand(1); 2072 SDValue ShAmt = Op.getOperand(2); 2073 2074 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2075 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2076 // {dHi, dLo} = {aHi, aLo} << Amt 2077 // dHi = shf.l.clamp aLo, aHi, Amt 2078 // dLo = aLo << Amt 2079 2080 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, 2081 ShAmt); 2082 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2083 2084 SDValue Ops[2] = { Lo, Hi }; 2085 return DAG.getMergeValues(Ops, dl); 2086 } 2087 else { 2088 // {dHi, dLo} = {aHi, aLo} << Amt 2089 // - if (Amt>=size) then 2090 // dLo = aLo << Amt (all 0) 2091 // dHi = aLo << (Amt-size) 2092 // else 2093 // dLo = aLo << Amt 2094 // dHi = (aHi << Amt) | (aLo >> (size-Amt)) 2095 2096 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2097 DAG.getConstant(VTBits, dl, MVT::i32), 2098 ShAmt); 2099 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 2100 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2101 DAG.getConstant(VTBits, dl, MVT::i32)); 2102 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 2103 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2104 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 2105 2106 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2107 DAG.getConstant(VTBits, dl, MVT::i32), 2108 ISD::SETGE); 2109 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2110 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2111 2112 SDValue Ops[2] = { Lo, Hi }; 2113 return DAG.getMergeValues(Ops, dl); 2114 } 2115 } 2116 2117 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2118 EVT VT = Op.getValueType(); 2119 2120 if (VT == MVT::f32) 2121 return LowerFROUND32(Op, DAG); 2122 2123 if (VT == MVT::f64) 2124 return LowerFROUND64(Op, DAG); 2125 2126 llvm_unreachable("unhandled type"); 2127 } 2128 2129 // This is the rounding method used in CUDA libdevice in C-like code: 2130 // float roundf(float A) 2131 // { 2132 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)); 2133 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2134 // return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2135 // } 2136 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op, 2137 SelectionDAG &DAG) const { 2138 SDLoc SL(Op); 2139 SDValue A = Op.getOperand(0); 2140 EVT VT = Op.getValueType(); 2141 2142 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2143 2144 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)) 2145 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A); 2146 const int SignBitMask = 0x80000000; 2147 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast, 2148 DAG.getConstant(SignBitMask, SL, MVT::i32)); 2149 const int PointFiveInBits = 0x3F000000; 2150 SDValue PointFiveWithSignRaw = 2151 DAG.getNode(ISD::OR, SL, MVT::i32, Sign, 2152 DAG.getConstant(PointFiveInBits, SL, MVT::i32)); 2153 SDValue PointFiveWithSign = 2154 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw); 2155 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign); 2156 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2157 2158 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2159 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2160 SDValue IsLarge = 2161 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT), 2162 ISD::SETOGT); 2163 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2164 2165 // return abs(A) < 0.5 ?
(float)(int)A : RoundedA; 2166 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2167 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2168 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2169 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2170 } 2171 2172 // The implementation of round(double) is similar to that of round(float) in 2173 // that they both separate the value range into three regions and use a method 2174 // specific to the region to round the values. However, round(double) first 2175 // calculates the round of the absolute value and then adds the sign back while 2176 // round(float) directly rounds the value with sign. 2177 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2178 SelectionDAG &DAG) const { 2179 SDLoc SL(Op); 2180 SDValue A = Op.getOperand(0); 2181 EVT VT = Op.getValueType(); 2182 2183 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2184 2185 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2186 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2187 DAG.getConstantFP(0.5, SL, VT)); 2188 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2189 2190 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2191 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2192 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2193 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2194 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2195 DAG.getConstantFP(0, SL, VT), 2196 RoundedA); 2197 2198 // Add sign to rounded_A 2199 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2200 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2201 2202 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2203 SDValue IsLarge = 2204 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2205 ISD::SETOGT); 2206 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2207 } 2208 2209 2210 2211 SDValue 2212 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2213 switch (Op.getOpcode()) { 2214 case ISD::RETURNADDR: 2215 return SDValue(); 2216 case ISD::FRAMEADDR: 2217 return SDValue(); 2218 case ISD::GlobalAddress: 2219 return LowerGlobalAddress(Op, DAG); 2220 case ISD::INTRINSIC_W_CHAIN: 2221 return Op; 2222 case ISD::BUILD_VECTOR: 2223 return LowerBUILD_VECTOR(Op, DAG); 2224 case ISD::EXTRACT_SUBVECTOR: 2225 return Op; 2226 case ISD::EXTRACT_VECTOR_ELT: 2227 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2228 case ISD::CONCAT_VECTORS: 2229 return LowerCONCAT_VECTORS(Op, DAG); 2230 case ISD::STORE: 2231 return LowerSTORE(Op, DAG); 2232 case ISD::LOAD: 2233 return LowerLOAD(Op, DAG); 2234 case ISD::SHL_PARTS: 2235 return LowerShiftLeftParts(Op, DAG); 2236 case ISD::SRA_PARTS: 2237 case ISD::SRL_PARTS: 2238 return LowerShiftRightParts(Op, DAG); 2239 case ISD::SELECT: 2240 return LowerSelect(Op, DAG); 2241 case ISD::FROUND: 2242 return LowerFROUND(Op, DAG); 2243 default: 2244 llvm_unreachable("Custom lowering not defined for operation"); 2245 } 2246 } 2247 2248 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2249 SDValue Op0 = Op->getOperand(0); 2250 SDValue Op1 = Op->getOperand(1); 2251 SDValue Op2 = Op->getOperand(2); 2252 SDLoc DL(Op.getNode()); 2253 2254 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2255 2256 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2257 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2258 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 
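// The i1 operands were any-extended to i32 above since NVPTX has no select
// on predicate values; truncate the result back to i1.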
2259 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2260 2261 return Trunc; 2262 } 2263 2264 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2265 if (Op.getValueType() == MVT::i1) 2266 return LowerLOADi1(Op, DAG); 2267 2268 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2269 // loads and have to handle it here. 2270 if (Op.getValueType() == MVT::v2f16) { 2271 LoadSDNode *Load = cast<LoadSDNode>(Op); 2272 EVT MemVT = Load->getMemoryVT(); 2273 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2274 MemVT, *Load->getMemOperand())) { 2275 SDValue Ops[2]; 2276 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2277 return DAG.getMergeValues(Ops, SDLoc(Op)); 2278 } 2279 } 2280 2281 return SDValue(); 2282 } 2283 2284 // v = ld i1* addr 2285 // => 2286 // v1 = ld i8* addr (-> i16) 2287 // v = trunc i16 to i1 2288 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2289 SDNode *Node = Op.getNode(); 2290 LoadSDNode *LD = cast<LoadSDNode>(Node); 2291 SDLoc dl(Node); 2292 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2293 assert(Node->getValueType(0) == MVT::i1 && 2294 "Custom lowering for i1 load only"); 2295 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), 2296 LD->getPointerInfo(), LD->getAlign(), 2297 LD->getMemOperand()->getFlags()); 2298 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2299 // The legalizer (the caller) is expecting two values from the legalized 2300 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2301 // in LegalizeDAG.cpp which also uses MergeValues. 2302 SDValue Ops[] = { result, LD->getChain() }; 2303 return DAG.getMergeValues(Ops, dl); 2304 } 2305 2306 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2307 StoreSDNode *Store = cast<StoreSDNode>(Op); 2308 EVT VT = Store->getMemoryVT(); 2309 2310 if (VT == MVT::i1) 2311 return LowerSTOREi1(Op, DAG); 2312 2313 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2314 // stores and have to handle it here. 2315 if (VT == MVT::v2f16 && 2316 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2317 VT, *Store->getMemOperand())) 2318 return expandUnalignedStore(Store, DAG); 2319 2320 if (VT.isVector()) 2321 return LowerSTOREVector(Op, DAG); 2322 2323 return SDValue(); 2324 } 2325 2326 SDValue 2327 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2328 SDNode *N = Op.getNode(); 2329 SDValue Val = N->getOperand(1); 2330 SDLoc DL(N); 2331 EVT ValVT = Val.getValueType(); 2332 2333 if (ValVT.isVector()) { 2334 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2335 // legal. We can (and should) split that into 2 stores of <2 x double> here 2336 // but I'm leaving that as a TODO for now. 
2337 if (!ValVT.isSimple()) 2338 return SDValue(); 2339 switch (ValVT.getSimpleVT().SimpleTy) { 2340 default: 2341 return SDValue(); 2342 case MVT::v2i8: 2343 case MVT::v2i16: 2344 case MVT::v2i32: 2345 case MVT::v2i64: 2346 case MVT::v2f16: 2347 case MVT::v2f32: 2348 case MVT::v2f64: 2349 case MVT::v4i8: 2350 case MVT::v4i16: 2351 case MVT::v4i32: 2352 case MVT::v4f16: 2353 case MVT::v4f32: 2354 case MVT::v8f16: // <4 x f16x2> 2355 // This is a "native" vector type 2356 break; 2357 } 2358 2359 MemSDNode *MemSD = cast<MemSDNode>(N); 2360 const DataLayout &TD = DAG.getDataLayout(); 2361 2362 Align Alignment = MemSD->getAlign(); 2363 Align PrefAlign = 2364 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); 2365 if (Alignment < PrefAlign) { 2366 // This store is not sufficiently aligned, so bail out and let this vector 2367 // store be scalarized. Note that we may still be able to emit smaller 2368 // vector stores. For example, if we are storing a <4 x float> with an 2369 // alignment of 8, this check will fail but the legalizer will try again 2370 // with 2 x <2 x float>, which will succeed with an alignment of 8. 2371 return SDValue(); 2372 } 2373 2374 unsigned Opcode = 0; 2375 EVT EltVT = ValVT.getVectorElementType(); 2376 unsigned NumElts = ValVT.getVectorNumElements(); 2377 2378 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2379 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 2380 // stored type to i16 and propagate the "real" type as the memory type. 2381 bool NeedExt = false; 2382 if (EltVT.getSizeInBits() < 16) 2383 NeedExt = true; 2384 2385 bool StoreF16x2 = false; 2386 switch (NumElts) { 2387 default: 2388 return SDValue(); 2389 case 2: 2390 Opcode = NVPTXISD::StoreV2; 2391 break; 2392 case 4: 2393 Opcode = NVPTXISD::StoreV4; 2394 break; 2395 case 8: 2396 // v8f16 is a special case. PTX doesn't have st.v8.f16 2397 // instruction. Instead, we split the vector into v2f16 chunks and 2398 // store them with st.v4.b32. 
2399 assert(EltVT == MVT::f16 && "Wrong type for the vector."); 2400 Opcode = NVPTXISD::StoreV4; 2401 StoreF16x2 = true; 2402 break; 2403 } 2404 2405 SmallVector<SDValue, 8> Ops; 2406 2407 // First is the chain 2408 Ops.push_back(N->getOperand(0)); 2409 2410 if (StoreF16x2) { 2411 // Combine f16,f16 -> v2f16 2412 NumElts /= 2; 2413 for (unsigned i = 0; i < NumElts; ++i) { 2414 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2415 DAG.getIntPtrConstant(i * 2, DL)); 2416 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val, 2417 DAG.getIntPtrConstant(i * 2 + 1, DL)); 2418 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1); 2419 Ops.push_back(V2); 2420 } 2421 } else { 2422 // Then the split values 2423 for (unsigned i = 0; i < NumElts; ++i) { 2424 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2425 DAG.getIntPtrConstant(i, DL)); 2426 if (NeedExt) 2427 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 2428 Ops.push_back(ExtVal); 2429 } 2430 } 2431 2432 // Then any remaining arguments 2433 Ops.append(N->op_begin() + 2, N->op_end()); 2434 2435 SDValue NewSt = 2436 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 2437 MemSD->getMemoryVT(), MemSD->getMemOperand()); 2438 2439 // return DCI.CombineTo(N, NewSt, true); 2440 return NewSt; 2441 } 2442 2443 return SDValue(); 2444 } 2445 2446 // st i1 v, addr 2447 // => 2448 // v1 = zxt v to i16 2449 // st.u8 i16, addr 2450 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 2451 SDNode *Node = Op.getNode(); 2452 SDLoc dl(Node); 2453 StoreSDNode *ST = cast<StoreSDNode>(Node); 2454 SDValue Tmp1 = ST->getChain(); 2455 SDValue Tmp2 = ST->getBasePtr(); 2456 SDValue Tmp3 = ST->getValue(); 2457 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 2458 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 2459 SDValue Result = 2460 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 2461 ST->getAlign(), ST->getMemOperand()->getFlags()); 2462 return Result; 2463 } 2464 2465 SDValue 2466 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { 2467 std::string ParamSym; 2468 raw_string_ostream ParamStr(ParamSym); 2469 2470 ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx; 2471 ParamStr.flush(); 2472 2473 std::string *SavedStr = 2474 nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str()); 2475 return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); 2476 } 2477 2478 SDValue NVPTXTargetLowering::LowerFormalArguments( 2479 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2480 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2481 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2482 MachineFunction &MF = DAG.getMachineFunction(); 2483 const DataLayout &DL = DAG.getDataLayout(); 2484 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2485 2486 const Function *F = &MF.getFunction(); 2487 const AttributeList &PAL = F->getAttributes(); 2488 const TargetLowering *TLI = STI.getTargetLowering(); 2489 2490 SDValue Root = DAG.getRoot(); 2491 std::vector<SDValue> OutChains; 2492 2493 bool isABI = (STI.getSmVersion() >= 20); 2494 assert(isABI && "Non-ABI compilation is not supported"); 2495 if (!isABI) 2496 return Chain; 2497 2498 std::vector<Type *> argTypes; 2499 std::vector<const Argument *> theArgs; 2500 for (const Argument &I : F->args()) { 2501 theArgs.push_back(&I); 2502 argTypes.push_back(I.getType()); 2503 } 2504 // 
argTypes.size() (or theArgs.size()) and Ins.size() need not match. 2505 // Ins.size() will be larger 2506 // * if there is an aggregate argument with multiple fields (each field 2507 // showing up separately in Ins) 2508 // * if there is a vector argument with more than typical vector-length 2509 // elements (generally if more than 4) where each vector element is 2510 // individually present in Ins. 2511 // So a different index should be used for indexing into Ins. 2512 // See similar issue in LowerCall. 2513 unsigned InsIdx = 0; 2514 2515 int idx = 0; 2516 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { 2517 Type *Ty = argTypes[i]; 2518 2519 if (theArgs[i]->use_empty()) { 2520 // argument is dead 2521 if (Ty->isAggregateType() || Ty->isIntegerTy(128)) { 2522 SmallVector<EVT, 16> vtparts; 2523 2524 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); 2525 assert(vtparts.size() > 0 && "empty aggregate type not expected"); 2526 for (unsigned parti = 0, parte = vtparts.size(); parti != parte; 2527 ++parti) { 2528 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2529 ++InsIdx; 2530 } 2531 if (vtparts.size() > 0) 2532 --InsIdx; 2533 continue; 2534 } 2535 if (Ty->isVectorTy()) { 2536 EVT ObjectVT = getValueType(DL, Ty); 2537 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); 2538 for (unsigned parti = 0; parti < NumRegs; ++parti) { 2539 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2540 ++InsIdx; 2541 } 2542 if (NumRegs > 0) 2543 --InsIdx; 2544 continue; 2545 } 2546 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2547 continue; 2548 } 2549 2550 // In the following cases, assign a node order of "idx+1" 2551 // to newly created nodes. The SDNodes for params have to 2552 // appear in the same order as their order of appearance 2553 // in the original function. "idx+1" holds that order. 2554 if (!PAL.hasParamAttr(i, Attribute::ByVal)) { 2555 bool aggregateIsPacked = false; 2556 if (StructType *STy = dyn_cast<StructType>(Ty)) 2557 aggregateIsPacked = STy->isPacked(); 2558 2559 SmallVector<EVT, 16> VTs; 2560 SmallVector<uint64_t, 16> Offsets; 2561 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 2562 assert(VTs.size() > 0 && "Unexpected empty type."); 2563 auto VectorInfo = 2564 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); 2565 2566 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2567 int VecIdx = -1; // Index of the first element of the current vector. 2568 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 2569 if (VectorInfo[parti] & PVF_FIRST) { 2570 assert(VecIdx == -1 && "Orphaned vector."); 2571 VecIdx = parti; 2572 } 2573 2574 // That's the last element of this store op. 2575 if (VectorInfo[parti] & PVF_LAST) { 2576 unsigned NumElts = parti - VecIdx + 1; 2577 EVT EltVT = VTs[parti]; 2578 // i1 is loaded/stored as i8. 2579 EVT LoadVT = EltVT; 2580 if (EltVT == MVT::i1) 2581 LoadVT = MVT::i8; 2582 else if (EltVT == MVT::v2f16) 2583 // getLoad needs a vector type, but it can't handle 2584 // vectors which contain v2f16 elements. So we must load 2585 // using i32 here and then bitcast back. 
2586 LoadVT = MVT::i32; 2587 2588 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 2589 SDValue VecAddr = 2590 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 2591 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 2592 Value *srcValue = Constant::getNullValue(PointerType::get( 2593 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 2594 SDValue P = 2595 DAG.getLoad(VecVT, dl, Root, VecAddr, 2596 MachinePointerInfo(srcValue), aggregateIsPacked, 2597 MachineMemOperand::MODereferenceable | 2598 MachineMemOperand::MOInvariant); 2599 if (P.getNode()) 2600 P.getNode()->setIROrder(idx + 1); 2601 for (unsigned j = 0; j < NumElts; ++j) { 2602 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 2603 DAG.getIntPtrConstant(j, dl)); 2604 // We've loaded i1 as an i8 and now must truncate it back to i1 2605 if (EltVT == MVT::i1) 2606 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 2607 // v2f16 was loaded as an i32. Now we must bitcast it back. 2608 else if (EltVT == MVT::v2f16) 2609 Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt); 2610 2611 // If a promoted integer type is used, truncate down to the original 2612 MVT PromotedVT; 2613 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 2614 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 2615 } 2616 2617 // Extend the element if necessary (e.g. an i8 is loaded 2618 // into an i16 register) 2619 if (Ins[InsIdx].VT.isInteger() && 2620 Ins[InsIdx].VT.getFixedSizeInBits() > 2621 LoadVT.getFixedSizeInBits()) { 2622 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 2623 : ISD::ZERO_EXTEND; 2624 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 2625 } 2626 InVals.push_back(Elt); 2627 } 2628 2629 // Reset vector tracking state. 2630 VecIdx = -1; 2631 } 2632 ++InsIdx; 2633 } 2634 if (VTs.size() > 0) 2635 --InsIdx; 2636 continue; 2637 } 2638 2639 // Param has ByVal attribute 2640 // Return MoveParam(param symbol). 2641 // Ideally, the param symbol can be returned directly, 2642 // but when SDNode builder decides to use it in a CopyToReg(), 2643 // machine instruction fails because TargetExternalSymbol 2644 // (not lowered) is target dependent, and CopyToReg assumes 2645 // the source is lowered. 2646 EVT ObjectVT = getValueType(DL, Ty); 2647 assert(ObjectVT == Ins[InsIdx].VT && 2648 "Ins type did not match function type"); 2649 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2650 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 2651 if (p.getNode()) 2652 p.getNode()->setIROrder(idx + 1); 2653 InVals.push_back(p); 2654 } 2655 2656 // Clang will check explicit VarArg and issue error if any. However, Clang 2657 // will let code with 2658 // implicit var arg like f() pass. See bug 617733. 2659 // We treat this case as if the arg list is empty. 
2660 // if (F.isVarArg()) { 2661 // assert(0 && "VarArg not supported yet!"); 2662 //} 2663 2664 if (!OutChains.empty()) 2665 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 2666 2667 return Chain; 2668 } 2669 2670 SDValue 2671 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2672 bool isVarArg, 2673 const SmallVectorImpl<ISD::OutputArg> &Outs, 2674 const SmallVectorImpl<SDValue> &OutVals, 2675 const SDLoc &dl, SelectionDAG &DAG) const { 2676 const MachineFunction &MF = DAG.getMachineFunction(); 2677 const Function &F = MF.getFunction(); 2678 Type *RetTy = MF.getFunction().getReturnType(); 2679 2680 bool isABI = (STI.getSmVersion() >= 20); 2681 assert(isABI && "Non-ABI compilation is not supported"); 2682 if (!isABI) 2683 return Chain; 2684 2685 const DataLayout &DL = DAG.getDataLayout(); 2686 SmallVector<SDValue, 16> PromotedOutVals; 2687 SmallVector<EVT, 16> VTs; 2688 SmallVector<uint64_t, 16> Offsets; 2689 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 2690 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 2691 2692 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2693 SDValue PromotedOutVal = OutVals[i]; 2694 MVT PromotedVT; 2695 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) { 2696 VTs[i] = EVT(PromotedVT); 2697 } 2698 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) { 2699 llvm::ISD::NodeType Ext = 2700 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 2701 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal); 2702 } 2703 PromotedOutVals.push_back(PromotedOutVal); 2704 } 2705 2706 auto VectorInfo = VectorizePTXValueVTs( 2707 VTs, Offsets, 2708 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) 2709 : Align(1)); 2710 2711 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2712 // 32-bits are sign extended or zero extended, depending on whether 2713 // they are signed or unsigned types. 2714 bool ExtendIntegerRetVal = 2715 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2716 2717 SmallVector<SDValue, 6> StoreOperands; 2718 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2719 // New load/store. Record chain and offset operands. 2720 if (VectorInfo[i] & PVF_FIRST) { 2721 assert(StoreOperands.empty() && "Orphaned operand list."); 2722 StoreOperands.push_back(Chain); 2723 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 2724 } 2725 2726 SDValue OutVal = OutVals[i]; 2727 SDValue RetVal = PromotedOutVals[i]; 2728 2729 if (ExtendIntegerRetVal) { 2730 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 2731 : ISD::ZERO_EXTEND, 2732 dl, MVT::i32, RetVal); 2733 } else if (OutVal.getValueSizeInBits() < 16) { 2734 // Use 16-bit registers for small load-stores as it's the 2735 // smallest general purpose register size supported by NVPTX. 2736 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 2737 } 2738 2739 // Record the value to return. 2740 StoreOperands.push_back(RetVal); 2741 2742 // That's the last element of this store op. 
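// Emit a single StoreRetval/StoreRetvalV2/StoreRetvalV4 covering all the
// elements collected since PVF_FIRST.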
2743 if (VectorInfo[i] & PVF_LAST) { 2744 NVPTXISD::NodeType Op; 2745 unsigned NumElts = StoreOperands.size() - 2; 2746 switch (NumElts) { 2747 case 1: 2748 Op = NVPTXISD::StoreRetval; 2749 break; 2750 case 2: 2751 Op = NVPTXISD::StoreRetvalV2; 2752 break; 2753 case 4: 2754 Op = NVPTXISD::StoreRetvalV4; 2755 break; 2756 default: 2757 llvm_unreachable("Invalid vector info."); 2758 } 2759 2760 // Adjust type of load/store op if we've extended the scalar 2761 // return value. 2762 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 2763 Chain = DAG.getMemIntrinsicNode( 2764 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, 2765 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 2766 // Cleanup vector state. 2767 StoreOperands.clear(); 2768 } 2769 } 2770 2771 return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); 2772 } 2773 2774 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 2775 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 2776 SelectionDAG &DAG) const { 2777 if (Constraint.length() > 1) 2778 return; 2779 else 2780 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 2781 } 2782 2783 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 2784 switch (Intrinsic) { 2785 default: 2786 return 0; 2787 2788 case Intrinsic::nvvm_tex_1d_v4f32_s32: 2789 return NVPTXISD::Tex1DFloatS32; 2790 case Intrinsic::nvvm_tex_1d_v4f32_f32: 2791 return NVPTXISD::Tex1DFloatFloat; 2792 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 2793 return NVPTXISD::Tex1DFloatFloatLevel; 2794 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 2795 return NVPTXISD::Tex1DFloatFloatGrad; 2796 case Intrinsic::nvvm_tex_1d_v4s32_s32: 2797 return NVPTXISD::Tex1DS32S32; 2798 case Intrinsic::nvvm_tex_1d_v4s32_f32: 2799 return NVPTXISD::Tex1DS32Float; 2800 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 2801 return NVPTXISD::Tex1DS32FloatLevel; 2802 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 2803 return NVPTXISD::Tex1DS32FloatGrad; 2804 case Intrinsic::nvvm_tex_1d_v4u32_s32: 2805 return NVPTXISD::Tex1DU32S32; 2806 case Intrinsic::nvvm_tex_1d_v4u32_f32: 2807 return NVPTXISD::Tex1DU32Float; 2808 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 2809 return NVPTXISD::Tex1DU32FloatLevel; 2810 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 2811 return NVPTXISD::Tex1DU32FloatGrad; 2812 2813 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 2814 return NVPTXISD::Tex1DArrayFloatS32; 2815 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 2816 return NVPTXISD::Tex1DArrayFloatFloat; 2817 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 2818 return NVPTXISD::Tex1DArrayFloatFloatLevel; 2819 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 2820 return NVPTXISD::Tex1DArrayFloatFloatGrad; 2821 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 2822 return NVPTXISD::Tex1DArrayS32S32; 2823 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 2824 return NVPTXISD::Tex1DArrayS32Float; 2825 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 2826 return NVPTXISD::Tex1DArrayS32FloatLevel; 2827 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 2828 return NVPTXISD::Tex1DArrayS32FloatGrad; 2829 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 2830 return NVPTXISD::Tex1DArrayU32S32; 2831 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 2832 return NVPTXISD::Tex1DArrayU32Float; 2833 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 2834 return NVPTXISD::Tex1DArrayU32FloatLevel; 2835 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 2836 return NVPTXISD::Tex1DArrayU32FloatGrad; 2837 2838 case 
Intrinsic::nvvm_tex_2d_v4f32_s32: 2839 return NVPTXISD::Tex2DFloatS32; 2840 case Intrinsic::nvvm_tex_2d_v4f32_f32: 2841 return NVPTXISD::Tex2DFloatFloat; 2842 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 2843 return NVPTXISD::Tex2DFloatFloatLevel; 2844 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 2845 return NVPTXISD::Tex2DFloatFloatGrad; 2846 case Intrinsic::nvvm_tex_2d_v4s32_s32: 2847 return NVPTXISD::Tex2DS32S32; 2848 case Intrinsic::nvvm_tex_2d_v4s32_f32: 2849 return NVPTXISD::Tex2DS32Float; 2850 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 2851 return NVPTXISD::Tex2DS32FloatLevel; 2852 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 2853 return NVPTXISD::Tex2DS32FloatGrad; 2854 case Intrinsic::nvvm_tex_2d_v4u32_s32: 2855 return NVPTXISD::Tex2DU32S32; 2856 case Intrinsic::nvvm_tex_2d_v4u32_f32: 2857 return NVPTXISD::Tex2DU32Float; 2858 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 2859 return NVPTXISD::Tex2DU32FloatLevel; 2860 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 2861 return NVPTXISD::Tex2DU32FloatGrad; 2862 2863 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 2864 return NVPTXISD::Tex2DArrayFloatS32; 2865 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 2866 return NVPTXISD::Tex2DArrayFloatFloat; 2867 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 2868 return NVPTXISD::Tex2DArrayFloatFloatLevel; 2869 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 2870 return NVPTXISD::Tex2DArrayFloatFloatGrad; 2871 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 2872 return NVPTXISD::Tex2DArrayS32S32; 2873 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 2874 return NVPTXISD::Tex2DArrayS32Float; 2875 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 2876 return NVPTXISD::Tex2DArrayS32FloatLevel; 2877 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 2878 return NVPTXISD::Tex2DArrayS32FloatGrad; 2879 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 2880 return NVPTXISD::Tex2DArrayU32S32; 2881 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 2882 return NVPTXISD::Tex2DArrayU32Float; 2883 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 2884 return NVPTXISD::Tex2DArrayU32FloatLevel; 2885 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 2886 return NVPTXISD::Tex2DArrayU32FloatGrad; 2887 2888 case Intrinsic::nvvm_tex_3d_v4f32_s32: 2889 return NVPTXISD::Tex3DFloatS32; 2890 case Intrinsic::nvvm_tex_3d_v4f32_f32: 2891 return NVPTXISD::Tex3DFloatFloat; 2892 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 2893 return NVPTXISD::Tex3DFloatFloatLevel; 2894 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 2895 return NVPTXISD::Tex3DFloatFloatGrad; 2896 case Intrinsic::nvvm_tex_3d_v4s32_s32: 2897 return NVPTXISD::Tex3DS32S32; 2898 case Intrinsic::nvvm_tex_3d_v4s32_f32: 2899 return NVPTXISD::Tex3DS32Float; 2900 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 2901 return NVPTXISD::Tex3DS32FloatLevel; 2902 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 2903 return NVPTXISD::Tex3DS32FloatGrad; 2904 case Intrinsic::nvvm_tex_3d_v4u32_s32: 2905 return NVPTXISD::Tex3DU32S32; 2906 case Intrinsic::nvvm_tex_3d_v4u32_f32: 2907 return NVPTXISD::Tex3DU32Float; 2908 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 2909 return NVPTXISD::Tex3DU32FloatLevel; 2910 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 2911 return NVPTXISD::Tex3DU32FloatGrad; 2912 2913 case Intrinsic::nvvm_tex_cube_v4f32_f32: 2914 return NVPTXISD::TexCubeFloatFloat; 2915 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 2916 return NVPTXISD::TexCubeFloatFloatLevel; 2917 case Intrinsic::nvvm_tex_cube_v4s32_f32: 2918 return NVPTXISD::TexCubeS32Float; 2919 case 
Intrinsic::nvvm_tex_cube_level_v4s32_f32: 2920 return NVPTXISD::TexCubeS32FloatLevel; 2921 case Intrinsic::nvvm_tex_cube_v4u32_f32: 2922 return NVPTXISD::TexCubeU32Float; 2923 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 2924 return NVPTXISD::TexCubeU32FloatLevel; 2925 2926 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 2927 return NVPTXISD::TexCubeArrayFloatFloat; 2928 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 2929 return NVPTXISD::TexCubeArrayFloatFloatLevel; 2930 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 2931 return NVPTXISD::TexCubeArrayS32Float; 2932 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 2933 return NVPTXISD::TexCubeArrayS32FloatLevel; 2934 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 2935 return NVPTXISD::TexCubeArrayU32Float; 2936 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 2937 return NVPTXISD::TexCubeArrayU32FloatLevel; 2938 2939 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 2940 return NVPTXISD::Tld4R2DFloatFloat; 2941 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 2942 return NVPTXISD::Tld4G2DFloatFloat; 2943 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 2944 return NVPTXISD::Tld4B2DFloatFloat; 2945 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 2946 return NVPTXISD::Tld4A2DFloatFloat; 2947 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 2948 return NVPTXISD::Tld4R2DS64Float; 2949 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 2950 return NVPTXISD::Tld4G2DS64Float; 2951 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 2952 return NVPTXISD::Tld4B2DS64Float; 2953 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 2954 return NVPTXISD::Tld4A2DS64Float; 2955 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 2956 return NVPTXISD::Tld4R2DU64Float; 2957 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 2958 return NVPTXISD::Tld4G2DU64Float; 2959 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 2960 return NVPTXISD::Tld4B2DU64Float; 2961 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 2962 return NVPTXISD::Tld4A2DU64Float; 2963 2964 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 2965 return NVPTXISD::TexUnified1DFloatS32; 2966 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 2967 return NVPTXISD::TexUnified1DFloatFloat; 2968 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 2969 return NVPTXISD::TexUnified1DFloatFloatLevel; 2970 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 2971 return NVPTXISD::TexUnified1DFloatFloatGrad; 2972 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 2973 return NVPTXISD::TexUnified1DS32S32; 2974 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 2975 return NVPTXISD::TexUnified1DS32Float; 2976 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 2977 return NVPTXISD::TexUnified1DS32FloatLevel; 2978 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 2979 return NVPTXISD::TexUnified1DS32FloatGrad; 2980 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 2981 return NVPTXISD::TexUnified1DU32S32; 2982 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 2983 return NVPTXISD::TexUnified1DU32Float; 2984 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 2985 return NVPTXISD::TexUnified1DU32FloatLevel; 2986 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 2987 return NVPTXISD::TexUnified1DU32FloatGrad; 2988 2989 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 2990 return NVPTXISD::TexUnified1DArrayFloatS32; 2991 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 2992 return NVPTXISD::TexUnified1DArrayFloatFloat; 2993 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 2994 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 2995 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 
2996 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 2997 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 2998 return NVPTXISD::TexUnified1DArrayS32S32; 2999 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3000 return NVPTXISD::TexUnified1DArrayS32Float; 3001 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3002 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 3003 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3004 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 3005 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3006 return NVPTXISD::TexUnified1DArrayU32S32; 3007 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3008 return NVPTXISD::TexUnified1DArrayU32Float; 3009 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3010 return NVPTXISD::TexUnified1DArrayU32FloatLevel; 3011 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3012 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 3013 3014 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3015 return NVPTXISD::TexUnified2DFloatS32; 3016 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3017 return NVPTXISD::TexUnified2DFloatFloat; 3018 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3019 return NVPTXISD::TexUnified2DFloatFloatLevel; 3020 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3021 return NVPTXISD::TexUnified2DFloatFloatGrad; 3022 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3023 return NVPTXISD::TexUnified2DS32S32; 3024 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3025 return NVPTXISD::TexUnified2DS32Float; 3026 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3027 return NVPTXISD::TexUnified2DS32FloatLevel; 3028 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3029 return NVPTXISD::TexUnified2DS32FloatGrad; 3030 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3031 return NVPTXISD::TexUnified2DU32S32; 3032 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3033 return NVPTXISD::TexUnified2DU32Float; 3034 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3035 return NVPTXISD::TexUnified2DU32FloatLevel; 3036 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3037 return NVPTXISD::TexUnified2DU32FloatGrad; 3038 3039 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3040 return NVPTXISD::TexUnified2DArrayFloatS32; 3041 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3042 return NVPTXISD::TexUnified2DArrayFloatFloat; 3043 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3044 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3045 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3046 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3047 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3048 return NVPTXISD::TexUnified2DArrayS32S32; 3049 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3050 return NVPTXISD::TexUnified2DArrayS32Float; 3051 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3052 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3053 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3054 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3055 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3056 return NVPTXISD::TexUnified2DArrayU32S32; 3057 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3058 return NVPTXISD::TexUnified2DArrayU32Float; 3059 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3060 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3061 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3062 return NVPTXISD::TexUnified2DArrayU32FloatGrad; 3063 3064 case 
Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3065 return NVPTXISD::TexUnified3DFloatS32; 3066 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3067 return NVPTXISD::TexUnified3DFloatFloat; 3068 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3069 return NVPTXISD::TexUnified3DFloatFloatLevel; 3070 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3071 return NVPTXISD::TexUnified3DFloatFloatGrad; 3072 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3073 return NVPTXISD::TexUnified3DS32S32; 3074 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3075 return NVPTXISD::TexUnified3DS32Float; 3076 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3077 return NVPTXISD::TexUnified3DS32FloatLevel; 3078 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3079 return NVPTXISD::TexUnified3DS32FloatGrad; 3080 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3081 return NVPTXISD::TexUnified3DU32S32; 3082 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3083 return NVPTXISD::TexUnified3DU32Float; 3084 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3085 return NVPTXISD::TexUnified3DU32FloatLevel; 3086 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3087 return NVPTXISD::TexUnified3DU32FloatGrad; 3088 3089 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3090 return NVPTXISD::TexUnifiedCubeFloatFloat; 3091 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3092 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3093 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3094 return NVPTXISD::TexUnifiedCubeS32Float; 3095 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3096 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3097 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3098 return NVPTXISD::TexUnifiedCubeU32Float; 3099 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3100 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3101 3102 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3103 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3104 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3105 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3106 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3107 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3108 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3109 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3110 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3111 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3112 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3113 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3114 3115 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3116 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3117 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3118 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3119 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3120 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3121 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3122 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3123 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3124 return NVPTXISD::Tld4UnifiedR2DS64Float; 3125 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3126 return NVPTXISD::Tld4UnifiedG2DS64Float; 3127 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3128 return NVPTXISD::Tld4UnifiedB2DS64Float; 3129 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3130 return NVPTXISD::Tld4UnifiedA2DS64Float; 3131 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3132 return NVPTXISD::Tld4UnifiedR2DU64Float; 3133 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3134 return 
NVPTXISD::Tld4UnifiedG2DU64Float; 3135 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3136 return NVPTXISD::Tld4UnifiedB2DU64Float; 3137 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3138 return NVPTXISD::Tld4UnifiedA2DU64Float; 3139 } 3140 } 3141 3142 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3143 switch (Intrinsic) { 3144 default: 3145 return 0; 3146 case Intrinsic::nvvm_suld_1d_i8_clamp: 3147 return NVPTXISD::Suld1DI8Clamp; 3148 case Intrinsic::nvvm_suld_1d_i16_clamp: 3149 return NVPTXISD::Suld1DI16Clamp; 3150 case Intrinsic::nvvm_suld_1d_i32_clamp: 3151 return NVPTXISD::Suld1DI32Clamp; 3152 case Intrinsic::nvvm_suld_1d_i64_clamp: 3153 return NVPTXISD::Suld1DI64Clamp; 3154 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3155 return NVPTXISD::Suld1DV2I8Clamp; 3156 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3157 return NVPTXISD::Suld1DV2I16Clamp; 3158 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3159 return NVPTXISD::Suld1DV2I32Clamp; 3160 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3161 return NVPTXISD::Suld1DV2I64Clamp; 3162 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3163 return NVPTXISD::Suld1DV4I8Clamp; 3164 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3165 return NVPTXISD::Suld1DV4I16Clamp; 3166 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3167 return NVPTXISD::Suld1DV4I32Clamp; 3168 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3169 return NVPTXISD::Suld1DArrayI8Clamp; 3170 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3171 return NVPTXISD::Suld1DArrayI16Clamp; 3172 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3173 return NVPTXISD::Suld1DArrayI32Clamp; 3174 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3175 return NVPTXISD::Suld1DArrayI64Clamp; 3176 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3177 return NVPTXISD::Suld1DArrayV2I8Clamp; 3178 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3179 return NVPTXISD::Suld1DArrayV2I16Clamp; 3180 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3181 return NVPTXISD::Suld1DArrayV2I32Clamp; 3182 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3183 return NVPTXISD::Suld1DArrayV2I64Clamp; 3184 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3185 return NVPTXISD::Suld1DArrayV4I8Clamp; 3186 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3187 return NVPTXISD::Suld1DArrayV4I16Clamp; 3188 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3189 return NVPTXISD::Suld1DArrayV4I32Clamp; 3190 case Intrinsic::nvvm_suld_2d_i8_clamp: 3191 return NVPTXISD::Suld2DI8Clamp; 3192 case Intrinsic::nvvm_suld_2d_i16_clamp: 3193 return NVPTXISD::Suld2DI16Clamp; 3194 case Intrinsic::nvvm_suld_2d_i32_clamp: 3195 return NVPTXISD::Suld2DI32Clamp; 3196 case Intrinsic::nvvm_suld_2d_i64_clamp: 3197 return NVPTXISD::Suld2DI64Clamp; 3198 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3199 return NVPTXISD::Suld2DV2I8Clamp; 3200 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3201 return NVPTXISD::Suld2DV2I16Clamp; 3202 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3203 return NVPTXISD::Suld2DV2I32Clamp; 3204 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3205 return NVPTXISD::Suld2DV2I64Clamp; 3206 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3207 return NVPTXISD::Suld2DV4I8Clamp; 3208 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3209 return NVPTXISD::Suld2DV4I16Clamp; 3210 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3211 return NVPTXISD::Suld2DV4I32Clamp; 3212 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3213 return NVPTXISD::Suld2DArrayI8Clamp; 3214 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3215 return NVPTXISD::Suld2DArrayI16Clamp; 3216 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 3217 return 
NVPTXISD::Suld2DArrayI32Clamp; 3218 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3219 return NVPTXISD::Suld2DArrayI64Clamp; 3220 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3221 return NVPTXISD::Suld2DArrayV2I8Clamp; 3222 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3223 return NVPTXISD::Suld2DArrayV2I16Clamp; 3224 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3225 return NVPTXISD::Suld2DArrayV2I32Clamp; 3226 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3227 return NVPTXISD::Suld2DArrayV2I64Clamp; 3228 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3229 return NVPTXISD::Suld2DArrayV4I8Clamp; 3230 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3231 return NVPTXISD::Suld2DArrayV4I16Clamp; 3232 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3233 return NVPTXISD::Suld2DArrayV4I32Clamp; 3234 case Intrinsic::nvvm_suld_3d_i8_clamp: 3235 return NVPTXISD::Suld3DI8Clamp; 3236 case Intrinsic::nvvm_suld_3d_i16_clamp: 3237 return NVPTXISD::Suld3DI16Clamp; 3238 case Intrinsic::nvvm_suld_3d_i32_clamp: 3239 return NVPTXISD::Suld3DI32Clamp; 3240 case Intrinsic::nvvm_suld_3d_i64_clamp: 3241 return NVPTXISD::Suld3DI64Clamp; 3242 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3243 return NVPTXISD::Suld3DV2I8Clamp; 3244 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3245 return NVPTXISD::Suld3DV2I16Clamp; 3246 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3247 return NVPTXISD::Suld3DV2I32Clamp; 3248 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3249 return NVPTXISD::Suld3DV2I64Clamp; 3250 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 3251 return NVPTXISD::Suld3DV4I8Clamp; 3252 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 3253 return NVPTXISD::Suld3DV4I16Clamp; 3254 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 3255 return NVPTXISD::Suld3DV4I32Clamp; 3256 case Intrinsic::nvvm_suld_1d_i8_trap: 3257 return NVPTXISD::Suld1DI8Trap; 3258 case Intrinsic::nvvm_suld_1d_i16_trap: 3259 return NVPTXISD::Suld1DI16Trap; 3260 case Intrinsic::nvvm_suld_1d_i32_trap: 3261 return NVPTXISD::Suld1DI32Trap; 3262 case Intrinsic::nvvm_suld_1d_i64_trap: 3263 return NVPTXISD::Suld1DI64Trap; 3264 case Intrinsic::nvvm_suld_1d_v2i8_trap: 3265 return NVPTXISD::Suld1DV2I8Trap; 3266 case Intrinsic::nvvm_suld_1d_v2i16_trap: 3267 return NVPTXISD::Suld1DV2I16Trap; 3268 case Intrinsic::nvvm_suld_1d_v2i32_trap: 3269 return NVPTXISD::Suld1DV2I32Trap; 3270 case Intrinsic::nvvm_suld_1d_v2i64_trap: 3271 return NVPTXISD::Suld1DV2I64Trap; 3272 case Intrinsic::nvvm_suld_1d_v4i8_trap: 3273 return NVPTXISD::Suld1DV4I8Trap; 3274 case Intrinsic::nvvm_suld_1d_v4i16_trap: 3275 return NVPTXISD::Suld1DV4I16Trap; 3276 case Intrinsic::nvvm_suld_1d_v4i32_trap: 3277 return NVPTXISD::Suld1DV4I32Trap; 3278 case Intrinsic::nvvm_suld_1d_array_i8_trap: 3279 return NVPTXISD::Suld1DArrayI8Trap; 3280 case Intrinsic::nvvm_suld_1d_array_i16_trap: 3281 return NVPTXISD::Suld1DArrayI16Trap; 3282 case Intrinsic::nvvm_suld_1d_array_i32_trap: 3283 return NVPTXISD::Suld1DArrayI32Trap; 3284 case Intrinsic::nvvm_suld_1d_array_i64_trap: 3285 return NVPTXISD::Suld1DArrayI64Trap; 3286 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 3287 return NVPTXISD::Suld1DArrayV2I8Trap; 3288 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 3289 return NVPTXISD::Suld1DArrayV2I16Trap; 3290 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 3291 return NVPTXISD::Suld1DArrayV2I32Trap; 3292 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 3293 return NVPTXISD::Suld1DArrayV2I64Trap; 3294 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 3295 return NVPTXISD::Suld1DArrayV4I8Trap; 3296 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 3297 
return NVPTXISD::Suld1DArrayV4I16Trap; 3298 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 3299 return NVPTXISD::Suld1DArrayV4I32Trap; 3300 case Intrinsic::nvvm_suld_2d_i8_trap: 3301 return NVPTXISD::Suld2DI8Trap; 3302 case Intrinsic::nvvm_suld_2d_i16_trap: 3303 return NVPTXISD::Suld2DI16Trap; 3304 case Intrinsic::nvvm_suld_2d_i32_trap: 3305 return NVPTXISD::Suld2DI32Trap; 3306 case Intrinsic::nvvm_suld_2d_i64_trap: 3307 return NVPTXISD::Suld2DI64Trap; 3308 case Intrinsic::nvvm_suld_2d_v2i8_trap: 3309 return NVPTXISD::Suld2DV2I8Trap; 3310 case Intrinsic::nvvm_suld_2d_v2i16_trap: 3311 return NVPTXISD::Suld2DV2I16Trap; 3312 case Intrinsic::nvvm_suld_2d_v2i32_trap: 3313 return NVPTXISD::Suld2DV2I32Trap; 3314 case Intrinsic::nvvm_suld_2d_v2i64_trap: 3315 return NVPTXISD::Suld2DV2I64Trap; 3316 case Intrinsic::nvvm_suld_2d_v4i8_trap: 3317 return NVPTXISD::Suld2DV4I8Trap; 3318 case Intrinsic::nvvm_suld_2d_v4i16_trap: 3319 return NVPTXISD::Suld2DV4I16Trap; 3320 case Intrinsic::nvvm_suld_2d_v4i32_trap: 3321 return NVPTXISD::Suld2DV4I32Trap; 3322 case Intrinsic::nvvm_suld_2d_array_i8_trap: 3323 return NVPTXISD::Suld2DArrayI8Trap; 3324 case Intrinsic::nvvm_suld_2d_array_i16_trap: 3325 return NVPTXISD::Suld2DArrayI16Trap; 3326 case Intrinsic::nvvm_suld_2d_array_i32_trap: 3327 return NVPTXISD::Suld2DArrayI32Trap; 3328 case Intrinsic::nvvm_suld_2d_array_i64_trap: 3329 return NVPTXISD::Suld2DArrayI64Trap; 3330 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 3331 return NVPTXISD::Suld2DArrayV2I8Trap; 3332 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 3333 return NVPTXISD::Suld2DArrayV2I16Trap; 3334 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 3335 return NVPTXISD::Suld2DArrayV2I32Trap; 3336 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 3337 return NVPTXISD::Suld2DArrayV2I64Trap; 3338 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 3339 return NVPTXISD::Suld2DArrayV4I8Trap; 3340 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 3341 return NVPTXISD::Suld2DArrayV4I16Trap; 3342 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 3343 return NVPTXISD::Suld2DArrayV4I32Trap; 3344 case Intrinsic::nvvm_suld_3d_i8_trap: 3345 return NVPTXISD::Suld3DI8Trap; 3346 case Intrinsic::nvvm_suld_3d_i16_trap: 3347 return NVPTXISD::Suld3DI16Trap; 3348 case Intrinsic::nvvm_suld_3d_i32_trap: 3349 return NVPTXISD::Suld3DI32Trap; 3350 case Intrinsic::nvvm_suld_3d_i64_trap: 3351 return NVPTXISD::Suld3DI64Trap; 3352 case Intrinsic::nvvm_suld_3d_v2i8_trap: 3353 return NVPTXISD::Suld3DV2I8Trap; 3354 case Intrinsic::nvvm_suld_3d_v2i16_trap: 3355 return NVPTXISD::Suld3DV2I16Trap; 3356 case Intrinsic::nvvm_suld_3d_v2i32_trap: 3357 return NVPTXISD::Suld3DV2I32Trap; 3358 case Intrinsic::nvvm_suld_3d_v2i64_trap: 3359 return NVPTXISD::Suld3DV2I64Trap; 3360 case Intrinsic::nvvm_suld_3d_v4i8_trap: 3361 return NVPTXISD::Suld3DV4I8Trap; 3362 case Intrinsic::nvvm_suld_3d_v4i16_trap: 3363 return NVPTXISD::Suld3DV4I16Trap; 3364 case Intrinsic::nvvm_suld_3d_v4i32_trap: 3365 return NVPTXISD::Suld3DV4I32Trap; 3366 case Intrinsic::nvvm_suld_1d_i8_zero: 3367 return NVPTXISD::Suld1DI8Zero; 3368 case Intrinsic::nvvm_suld_1d_i16_zero: 3369 return NVPTXISD::Suld1DI16Zero; 3370 case Intrinsic::nvvm_suld_1d_i32_zero: 3371 return NVPTXISD::Suld1DI32Zero; 3372 case Intrinsic::nvvm_suld_1d_i64_zero: 3373 return NVPTXISD::Suld1DI64Zero; 3374 case Intrinsic::nvvm_suld_1d_v2i8_zero: 3375 return NVPTXISD::Suld1DV2I8Zero; 3376 case Intrinsic::nvvm_suld_1d_v2i16_zero: 3377 return NVPTXISD::Suld1DV2I16Zero; 3378 case Intrinsic::nvvm_suld_1d_v2i32_zero: 3379 return 
NVPTXISD::Suld1DV2I32Zero; 3380 case Intrinsic::nvvm_suld_1d_v2i64_zero: 3381 return NVPTXISD::Suld1DV2I64Zero; 3382 case Intrinsic::nvvm_suld_1d_v4i8_zero: 3383 return NVPTXISD::Suld1DV4I8Zero; 3384 case Intrinsic::nvvm_suld_1d_v4i16_zero: 3385 return NVPTXISD::Suld1DV4I16Zero; 3386 case Intrinsic::nvvm_suld_1d_v4i32_zero: 3387 return NVPTXISD::Suld1DV4I32Zero; 3388 case Intrinsic::nvvm_suld_1d_array_i8_zero: 3389 return NVPTXISD::Suld1DArrayI8Zero; 3390 case Intrinsic::nvvm_suld_1d_array_i16_zero: 3391 return NVPTXISD::Suld1DArrayI16Zero; 3392 case Intrinsic::nvvm_suld_1d_array_i32_zero: 3393 return NVPTXISD::Suld1DArrayI32Zero; 3394 case Intrinsic::nvvm_suld_1d_array_i64_zero: 3395 return NVPTXISD::Suld1DArrayI64Zero; 3396 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 3397 return NVPTXISD::Suld1DArrayV2I8Zero; 3398 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 3399 return NVPTXISD::Suld1DArrayV2I16Zero; 3400 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 3401 return NVPTXISD::Suld1DArrayV2I32Zero; 3402 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 3403 return NVPTXISD::Suld1DArrayV2I64Zero; 3404 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 3405 return NVPTXISD::Suld1DArrayV4I8Zero; 3406 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 3407 return NVPTXISD::Suld1DArrayV4I16Zero; 3408 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 3409 return NVPTXISD::Suld1DArrayV4I32Zero; 3410 case Intrinsic::nvvm_suld_2d_i8_zero: 3411 return NVPTXISD::Suld2DI8Zero; 3412 case Intrinsic::nvvm_suld_2d_i16_zero: 3413 return NVPTXISD::Suld2DI16Zero; 3414 case Intrinsic::nvvm_suld_2d_i32_zero: 3415 return NVPTXISD::Suld2DI32Zero; 3416 case Intrinsic::nvvm_suld_2d_i64_zero: 3417 return NVPTXISD::Suld2DI64Zero; 3418 case Intrinsic::nvvm_suld_2d_v2i8_zero: 3419 return NVPTXISD::Suld2DV2I8Zero; 3420 case Intrinsic::nvvm_suld_2d_v2i16_zero: 3421 return NVPTXISD::Suld2DV2I16Zero; 3422 case Intrinsic::nvvm_suld_2d_v2i32_zero: 3423 return NVPTXISD::Suld2DV2I32Zero; 3424 case Intrinsic::nvvm_suld_2d_v2i64_zero: 3425 return NVPTXISD::Suld2DV2I64Zero; 3426 case Intrinsic::nvvm_suld_2d_v4i8_zero: 3427 return NVPTXISD::Suld2DV4I8Zero; 3428 case Intrinsic::nvvm_suld_2d_v4i16_zero: 3429 return NVPTXISD::Suld2DV4I16Zero; 3430 case Intrinsic::nvvm_suld_2d_v4i32_zero: 3431 return NVPTXISD::Suld2DV4I32Zero; 3432 case Intrinsic::nvvm_suld_2d_array_i8_zero: 3433 return NVPTXISD::Suld2DArrayI8Zero; 3434 case Intrinsic::nvvm_suld_2d_array_i16_zero: 3435 return NVPTXISD::Suld2DArrayI16Zero; 3436 case Intrinsic::nvvm_suld_2d_array_i32_zero: 3437 return NVPTXISD::Suld2DArrayI32Zero; 3438 case Intrinsic::nvvm_suld_2d_array_i64_zero: 3439 return NVPTXISD::Suld2DArrayI64Zero; 3440 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 3441 return NVPTXISD::Suld2DArrayV2I8Zero; 3442 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 3443 return NVPTXISD::Suld2DArrayV2I16Zero; 3444 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 3445 return NVPTXISD::Suld2DArrayV2I32Zero; 3446 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 3447 return NVPTXISD::Suld2DArrayV2I64Zero; 3448 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 3449 return NVPTXISD::Suld2DArrayV4I8Zero; 3450 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 3451 return NVPTXISD::Suld2DArrayV4I16Zero; 3452 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 3453 return NVPTXISD::Suld2DArrayV4I32Zero; 3454 case Intrinsic::nvvm_suld_3d_i8_zero: 3455 return NVPTXISD::Suld3DI8Zero; 3456 case Intrinsic::nvvm_suld_3d_i16_zero: 3457 return NVPTXISD::Suld3DI16Zero; 3458 case Intrinsic::nvvm_suld_3d_i32_zero: 3459 
    return NVPTXISD::Suld3DI32Zero;
  case Intrinsic::nvvm_suld_3d_i64_zero:
    return NVPTXISD::Suld3DI64Zero;
  case Intrinsic::nvvm_suld_3d_v2i8_zero:
    return NVPTXISD::Suld3DV2I8Zero;
  case Intrinsic::nvvm_suld_3d_v2i16_zero:
    return NVPTXISD::Suld3DV2I16Zero;
  case Intrinsic::nvvm_suld_3d_v2i32_zero:
    return NVPTXISD::Suld3DV2I32Zero;
  case Intrinsic::nvvm_suld_3d_v2i64_zero:
    return NVPTXISD::Suld3DV2I64Zero;
  case Intrinsic::nvvm_suld_3d_v4i8_zero:
    return NVPTXISD::Suld3DV4I8Zero;
  case Intrinsic::nvvm_suld_3d_v4i16_zero:
    return NVPTXISD::Suld3DV4I16Zero;
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
    return NVPTXISD::Suld3DV4I32Zero;
  }
}

// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need the information that is only available in
// the "Value" type of the destination pointer; in particular, its address
// space.
bool NVPTXTargetLowering::getTgtMemIntrinsic(
    IntrinsicInfo &Info, const CallInst &I,
    MachineFunction &MF, unsigned Intrinsic) const {
  switch (Intrinsic) {
  default:
    return false;
  case Intrinsic::nvvm_match_all_sync_i32p:
  case Intrinsic::nvvm_match_all_sync_i64p:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
    // in order to model data exchange with other threads, but perform no real
    // memory accesses.
    Info.memVT = MVT::i1;

    // Our result depends on both our and other threads' arguments.
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
  case
Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: 3536 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: 3537 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: 3538 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: 3539 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: 3540 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: 3541 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: 3542 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: 3543 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: 3544 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: 3545 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: 3546 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: 3547 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: 3548 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 3549 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 3550 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 3551 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 3552 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: 3553 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: 3554 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: 3555 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: 3556 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { 3557 Info.opc = ISD::INTRINSIC_W_CHAIN; 3558 Info.memVT = MVT::v2i32; 3559 Info.ptrVal = I.getArgOperand(0); 3560 Info.offset = 0; 3561 Info.flags = MachineMemOperand::MOLoad; 3562 Info.align = Align(8); 3563 return true; 3564 } 3565 3566 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 3567 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 3568 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 3569 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 3570 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 3571 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 3572 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 3573 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 3574 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: 3575 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: 3576 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: 3577 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: 3578 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: 3579 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: 3580 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: 3581 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: 3582 3583 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 3584 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 3585 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 3586 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 3587 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 3588 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 3589 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 3590 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: 3591 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: 3592 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: 3593 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: 3594 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: 3595 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: 3596 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: 3597 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: 3598 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: 3599 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: 3600 case 
Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: { 3601 Info.opc = ISD::INTRINSIC_W_CHAIN; 3602 Info.memVT = MVT::v4i32; 3603 Info.ptrVal = I.getArgOperand(0); 3604 Info.offset = 0; 3605 Info.flags = MachineMemOperand::MOLoad; 3606 Info.align = Align(16); 3607 return true; 3608 } 3609 3610 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 3611 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 3612 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 3613 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 3614 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 3615 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 3616 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 3617 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 3618 3619 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 3620 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 3621 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 3622 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 3623 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 3624 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 3625 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 3626 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 3627 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 3628 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 3629 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 3630 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 3631 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 3632 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 3633 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 3634 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 3635 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 3636 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 3637 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 3638 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: 3639 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: 3640 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: { 3641 Info.opc = ISD::INTRINSIC_W_CHAIN; 3642 Info.memVT = MVT::i32; 3643 Info.ptrVal = I.getArgOperand(0); 3644 Info.offset = 0; 3645 Info.flags = MachineMemOperand::MOLoad; 3646 Info.align = Align(4); 3647 return true; 3648 } 3649 3650 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 3651 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 3652 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 3653 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 3654 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 3655 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 3656 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 3657 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 3658 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 3659 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 3660 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 3661 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 3662 Info.opc = ISD::INTRINSIC_W_CHAIN; 3663 Info.memVT = MVT::v4f16; 3664 Info.ptrVal = I.getArgOperand(0); 3665 Info.offset = 0; 3666 Info.flags = MachineMemOperand::MOLoad; 3667 Info.align = Align(16); 3668 return true; 3669 } 3670 3671 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 3672 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 3673 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 3674 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 3675 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 3676 case 
Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 3677 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 3678 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 3679 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 3680 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 3681 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 3682 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: 3683 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: 3684 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: 3685 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: 3686 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { 3687 Info.opc = ISD::INTRINSIC_W_CHAIN; 3688 Info.memVT = MVT::v8f32; 3689 Info.ptrVal = I.getArgOperand(0); 3690 Info.offset = 0; 3691 Info.flags = MachineMemOperand::MOLoad; 3692 Info.align = Align(16); 3693 return true; 3694 } 3695 3696 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: 3697 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: 3698 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: 3699 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: 3700 3701 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: 3702 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: 3703 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: 3704 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: 3705 3706 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 3707 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 3708 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 3709 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 3710 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 3711 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 3712 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 3713 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 3714 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 3715 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 3716 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 3717 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 3718 Info.opc = ISD::INTRINSIC_W_CHAIN; 3719 Info.memVT = MVT::v8i32; 3720 Info.ptrVal = I.getArgOperand(0); 3721 Info.offset = 0; 3722 Info.flags = MachineMemOperand::MOLoad; 3723 Info.align = Align(16); 3724 return true; 3725 } 3726 3727 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 3728 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 3729 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 3730 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 3731 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 3732 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 3733 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 3734 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: 3735 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: 3736 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: { 3737 Info.opc = ISD::INTRINSIC_W_CHAIN; 3738 Info.memVT = MVT::v2i32; 3739 Info.ptrVal = I.getArgOperand(0); 3740 Info.offset = 0; 3741 Info.flags = MachineMemOperand::MOLoad; 3742 Info.align = Align(8); 3743 return true; 3744 } 3745 3746 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: 3747 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: 3748 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: 3749 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: 3750 3751 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: 3752 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: 3753 case 
Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: 3754 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { 3755 Info.opc = ISD::INTRINSIC_W_CHAIN; 3756 Info.memVT = MVT::f64; 3757 Info.ptrVal = I.getArgOperand(0); 3758 Info.offset = 0; 3759 Info.flags = MachineMemOperand::MOLoad; 3760 Info.align = Align(8); 3761 return true; 3762 } 3763 3764 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: 3765 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: 3766 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: 3767 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { 3768 Info.opc = ISD::INTRINSIC_W_CHAIN; 3769 Info.memVT = MVT::v2f64; 3770 Info.ptrVal = I.getArgOperand(0); 3771 Info.offset = 0; 3772 Info.flags = MachineMemOperand::MOLoad; 3773 Info.align = Align(16); 3774 return true; 3775 } 3776 3777 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 3778 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 3779 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 3780 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 3781 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 3782 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 3783 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 3784 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 3785 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 3786 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 3787 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 3788 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 3789 Info.opc = ISD::INTRINSIC_VOID; 3790 Info.memVT = MVT::v4f16; 3791 Info.ptrVal = I.getArgOperand(0); 3792 Info.offset = 0; 3793 Info.flags = MachineMemOperand::MOStore; 3794 Info.align = Align(16); 3795 return true; 3796 } 3797 3798 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 3799 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 3800 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 3801 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 3802 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 3803 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 3804 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 3805 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 3806 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 3807 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 3808 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 3809 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: 3810 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: 3811 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: 3812 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: 3813 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { 3814 Info.opc = ISD::INTRINSIC_VOID; 3815 Info.memVT = MVT::v8f32; 3816 Info.ptrVal = I.getArgOperand(0); 3817 Info.offset = 0; 3818 Info.flags = MachineMemOperand::MOStore; 3819 Info.align = Align(16); 3820 return true; 3821 } 3822 3823 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 3824 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 3825 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 3826 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 3827 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 3828 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 3829 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 3830 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 3831 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 3832 case 
Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 3833 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 3834 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 3835 Info.opc = ISD::INTRINSIC_VOID; 3836 Info.memVT = MVT::v8i32; 3837 Info.ptrVal = I.getArgOperand(0); 3838 Info.offset = 0; 3839 Info.flags = MachineMemOperand::MOStore; 3840 Info.align = Align(16); 3841 return true; 3842 } 3843 3844 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 3845 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 3846 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 3847 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 3848 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 3849 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 3850 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 3851 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 3852 Info.opc = ISD::INTRINSIC_VOID; 3853 Info.memVT = MVT::v2i32; 3854 Info.ptrVal = I.getArgOperand(0); 3855 Info.offset = 0; 3856 Info.flags = MachineMemOperand::MOStore; 3857 Info.align = Align(8); 3858 return true; 3859 } 3860 3861 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: 3862 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: 3863 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: 3864 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { 3865 Info.opc = ISD::INTRINSIC_VOID; 3866 Info.memVT = MVT::v2f64; 3867 Info.ptrVal = I.getArgOperand(0); 3868 Info.offset = 0; 3869 Info.flags = MachineMemOperand::MOStore; 3870 Info.align = Align(16); 3871 return true; 3872 } 3873 3874 case Intrinsic::nvvm_atomic_load_inc_32: 3875 case Intrinsic::nvvm_atomic_load_dec_32: 3876 3877 case Intrinsic::nvvm_atomic_add_gen_f_cta: 3878 case Intrinsic::nvvm_atomic_add_gen_f_sys: 3879 case Intrinsic::nvvm_atomic_add_gen_i_cta: 3880 case Intrinsic::nvvm_atomic_add_gen_i_sys: 3881 case Intrinsic::nvvm_atomic_and_gen_i_cta: 3882 case Intrinsic::nvvm_atomic_and_gen_i_sys: 3883 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 3884 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 3885 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 3886 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 3887 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 3888 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 3889 case Intrinsic::nvvm_atomic_max_gen_i_cta: 3890 case Intrinsic::nvvm_atomic_max_gen_i_sys: 3891 case Intrinsic::nvvm_atomic_min_gen_i_cta: 3892 case Intrinsic::nvvm_atomic_min_gen_i_sys: 3893 case Intrinsic::nvvm_atomic_or_gen_i_cta: 3894 case Intrinsic::nvvm_atomic_or_gen_i_sys: 3895 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 3896 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 3897 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 3898 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 3899 auto &DL = I.getModule()->getDataLayout(); 3900 Info.opc = ISD::INTRINSIC_W_CHAIN; 3901 Info.memVT = getValueType(DL, I.getType()); 3902 Info.ptrVal = I.getArgOperand(0); 3903 Info.offset = 0; 3904 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 3905 Info.align.reset(); 3906 return true; 3907 } 3908 3909 case Intrinsic::nvvm_ldu_global_i: 3910 case Intrinsic::nvvm_ldu_global_f: 3911 case Intrinsic::nvvm_ldu_global_p: { 3912 auto &DL = I.getModule()->getDataLayout(); 3913 Info.opc = ISD::INTRINSIC_W_CHAIN; 3914 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 3915 Info.memVT = getValueType(DL, I.getType()); 3916 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 3917 Info.memVT = getPointerTy(DL); 3918 else 3919 Info.memVT = getValueType(DL, I.getType()); 3920 Info.ptrVal 
= I.getArgOperand(0); 3921 Info.offset = 0; 3922 Info.flags = MachineMemOperand::MOLoad; 3923 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 3924 3925 return true; 3926 } 3927 case Intrinsic::nvvm_ldg_global_i: 3928 case Intrinsic::nvvm_ldg_global_f: 3929 case Intrinsic::nvvm_ldg_global_p: { 3930 auto &DL = I.getModule()->getDataLayout(); 3931 3932 Info.opc = ISD::INTRINSIC_W_CHAIN; 3933 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 3934 Info.memVT = getValueType(DL, I.getType()); 3935 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 3936 Info.memVT = getPointerTy(DL); 3937 else 3938 Info.memVT = getValueType(DL, I.getType()); 3939 Info.ptrVal = I.getArgOperand(0); 3940 Info.offset = 0; 3941 Info.flags = MachineMemOperand::MOLoad; 3942 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 3943 3944 return true; 3945 } 3946 3947 case Intrinsic::nvvm_tex_1d_v4f32_s32: 3948 case Intrinsic::nvvm_tex_1d_v4f32_f32: 3949 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 3950 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3951 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3952 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3953 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3954 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3955 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3956 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3957 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3958 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3959 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3960 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3961 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3962 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3963 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3964 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3965 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3966 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3967 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3968 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3969 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3970 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3971 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3972 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3973 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3974 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3975 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3976 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3977 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3978 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3979 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3980 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3981 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3982 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3983 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3984 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3985 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3986 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3987 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3988 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3989 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3990 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3991 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3992 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3993 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3994 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3995 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3996 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3997 case 
Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3998 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3999 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 4000 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 4001 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 4002 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 4003 Info.opc = getOpcForTextureInstr(Intrinsic); 4004 Info.memVT = MVT::v4f32; 4005 Info.ptrVal = nullptr; 4006 Info.offset = 0; 4007 Info.flags = MachineMemOperand::MOLoad; 4008 Info.align = Align(16); 4009 return true; 4010 4011 case Intrinsic::nvvm_tex_1d_v4s32_s32: 4012 case Intrinsic::nvvm_tex_1d_v4s32_f32: 4013 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 4014 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 4015 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 4016 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 4017 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 4018 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 4019 case Intrinsic::nvvm_tex_2d_v4s32_s32: 4020 case Intrinsic::nvvm_tex_2d_v4s32_f32: 4021 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 4022 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 4023 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 4024 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 4025 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 4026 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 4027 case Intrinsic::nvvm_tex_3d_v4s32_s32: 4028 case Intrinsic::nvvm_tex_3d_v4s32_f32: 4029 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 4030 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 4031 case Intrinsic::nvvm_tex_cube_v4s32_f32: 4032 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 4033 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 4034 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 4035 case Intrinsic::nvvm_tex_cube_v4u32_f32: 4036 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 4037 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 4038 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 4039 case Intrinsic::nvvm_tex_1d_v4u32_s32: 4040 case Intrinsic::nvvm_tex_1d_v4u32_f32: 4041 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 4042 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 4043 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 4044 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 4045 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 4046 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 4047 case Intrinsic::nvvm_tex_2d_v4u32_s32: 4048 case Intrinsic::nvvm_tex_2d_v4u32_f32: 4049 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 4050 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 4051 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 4052 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 4053 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 4054 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 4055 case Intrinsic::nvvm_tex_3d_v4u32_s32: 4056 case Intrinsic::nvvm_tex_3d_v4u32_f32: 4057 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 4058 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 4059 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 4060 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 4061 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 4062 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 4063 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 4064 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 4065 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 4066 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 4067 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 4068 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 4069 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 4070 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 4071 case 
Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 4072 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 4073 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 4074 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 4075 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 4076 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 4077 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 4078 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 4079 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 4080 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 4081 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 4082 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 4083 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 4084 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 4085 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 4086 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 4087 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 4088 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 4089 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 4090 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 4091 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 4092 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 4093 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 4094 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 4095 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 4096 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 4097 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 4098 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 4099 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 4100 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 4101 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 4102 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 4103 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 4104 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 4105 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 4106 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 4107 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 4108 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 4109 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 4110 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 4111 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 4112 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 4113 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 4114 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 4115 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 4116 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 4117 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 4118 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 4119 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 4120 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 4121 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 4122 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 4123 Info.opc = getOpcForTextureInstr(Intrinsic); 4124 Info.memVT = MVT::v4i32; 4125 Info.ptrVal = nullptr; 4126 Info.offset = 0; 4127 Info.flags = MachineMemOperand::MOLoad; 4128 Info.align = Align(16); 4129 return true; 4130 4131 case Intrinsic::nvvm_suld_1d_i8_clamp: 4132 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 4133 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 4134 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 4135 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 4136 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 4137 case Intrinsic::nvvm_suld_2d_i8_clamp: 4138 case 
Intrinsic::nvvm_suld_2d_v2i8_clamp: 4139 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4140 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4141 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4142 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4143 case Intrinsic::nvvm_suld_3d_i8_clamp: 4144 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4145 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4146 case Intrinsic::nvvm_suld_1d_i8_trap: 4147 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4148 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4149 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4150 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4151 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4152 case Intrinsic::nvvm_suld_2d_i8_trap: 4153 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4154 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4155 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4156 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4157 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4158 case Intrinsic::nvvm_suld_3d_i8_trap: 4159 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4160 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4161 case Intrinsic::nvvm_suld_1d_i8_zero: 4162 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4163 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4164 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4165 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4166 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4167 case Intrinsic::nvvm_suld_2d_i8_zero: 4168 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4169 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4170 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4171 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4172 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4173 case Intrinsic::nvvm_suld_3d_i8_zero: 4174 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4175 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4176 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4177 Info.memVT = MVT::i8; 4178 Info.ptrVal = nullptr; 4179 Info.offset = 0; 4180 Info.flags = MachineMemOperand::MOLoad; 4181 Info.align = Align(16); 4182 return true; 4183 4184 case Intrinsic::nvvm_suld_1d_i16_clamp: 4185 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4186 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4187 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4188 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4189 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4190 case Intrinsic::nvvm_suld_2d_i16_clamp: 4191 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4192 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4193 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4194 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4195 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4196 case Intrinsic::nvvm_suld_3d_i16_clamp: 4197 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4198 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4199 case Intrinsic::nvvm_suld_1d_i16_trap: 4200 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4201 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4202 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4203 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4204 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4205 case Intrinsic::nvvm_suld_2d_i16_trap: 4206 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4207 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4208 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4209 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4210 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4211 case Intrinsic::nvvm_suld_3d_i16_trap: 4212 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4213 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4214 case Intrinsic::nvvm_suld_1d_i16_zero: 4215 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4216 case 
Intrinsic::nvvm_suld_1d_v4i16_zero: 4217 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4218 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4219 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4220 case Intrinsic::nvvm_suld_2d_i16_zero: 4221 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4222 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4223 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4224 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4225 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4226 case Intrinsic::nvvm_suld_3d_i16_zero: 4227 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4228 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4229 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4230 Info.memVT = MVT::i16; 4231 Info.ptrVal = nullptr; 4232 Info.offset = 0; 4233 Info.flags = MachineMemOperand::MOLoad; 4234 Info.align = Align(16); 4235 return true; 4236 4237 case Intrinsic::nvvm_suld_1d_i32_clamp: 4238 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4239 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4240 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 4241 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 4242 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 4243 case Intrinsic::nvvm_suld_2d_i32_clamp: 4244 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 4245 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 4246 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 4247 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 4248 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 4249 case Intrinsic::nvvm_suld_3d_i32_clamp: 4250 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 4251 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 4252 case Intrinsic::nvvm_suld_1d_i32_trap: 4253 case Intrinsic::nvvm_suld_1d_v2i32_trap: 4254 case Intrinsic::nvvm_suld_1d_v4i32_trap: 4255 case Intrinsic::nvvm_suld_1d_array_i32_trap: 4256 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 4257 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 4258 case Intrinsic::nvvm_suld_2d_i32_trap: 4259 case Intrinsic::nvvm_suld_2d_v2i32_trap: 4260 case Intrinsic::nvvm_suld_2d_v4i32_trap: 4261 case Intrinsic::nvvm_suld_2d_array_i32_trap: 4262 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 4263 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 4264 case Intrinsic::nvvm_suld_3d_i32_trap: 4265 case Intrinsic::nvvm_suld_3d_v2i32_trap: 4266 case Intrinsic::nvvm_suld_3d_v4i32_trap: 4267 case Intrinsic::nvvm_suld_1d_i32_zero: 4268 case Intrinsic::nvvm_suld_1d_v2i32_zero: 4269 case Intrinsic::nvvm_suld_1d_v4i32_zero: 4270 case Intrinsic::nvvm_suld_1d_array_i32_zero: 4271 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 4272 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 4273 case Intrinsic::nvvm_suld_2d_i32_zero: 4274 case Intrinsic::nvvm_suld_2d_v2i32_zero: 4275 case Intrinsic::nvvm_suld_2d_v4i32_zero: 4276 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4277 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4278 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4279 case Intrinsic::nvvm_suld_3d_i32_zero: 4280 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4281 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4282 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4283 Info.memVT = MVT::i32; 4284 Info.ptrVal = nullptr; 4285 Info.offset = 0; 4286 Info.flags = MachineMemOperand::MOLoad; 4287 Info.align = Align(16); 4288 return true; 4289 4290 case Intrinsic::nvvm_suld_1d_i64_clamp: 4291 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 4292 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 4293 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 4294 case Intrinsic::nvvm_suld_2d_i64_clamp: 4295 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 4296 case 
Intrinsic::nvvm_suld_2d_array_i64_clamp: 4297 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 4298 case Intrinsic::nvvm_suld_3d_i64_clamp: 4299 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 4300 case Intrinsic::nvvm_suld_1d_i64_trap: 4301 case Intrinsic::nvvm_suld_1d_v2i64_trap: 4302 case Intrinsic::nvvm_suld_1d_array_i64_trap: 4303 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 4304 case Intrinsic::nvvm_suld_2d_i64_trap: 4305 case Intrinsic::nvvm_suld_2d_v2i64_trap: 4306 case Intrinsic::nvvm_suld_2d_array_i64_trap: 4307 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 4308 case Intrinsic::nvvm_suld_3d_i64_trap: 4309 case Intrinsic::nvvm_suld_3d_v2i64_trap: 4310 case Intrinsic::nvvm_suld_1d_i64_zero: 4311 case Intrinsic::nvvm_suld_1d_v2i64_zero: 4312 case Intrinsic::nvvm_suld_1d_array_i64_zero: 4313 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 4314 case Intrinsic::nvvm_suld_2d_i64_zero: 4315 case Intrinsic::nvvm_suld_2d_v2i64_zero: 4316 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4317 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4318 case Intrinsic::nvvm_suld_3d_i64_zero: 4319 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4320 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4321 Info.memVT = MVT::i64; 4322 Info.ptrVal = nullptr; 4323 Info.offset = 0; 4324 Info.flags = MachineMemOperand::MOLoad; 4325 Info.align = Align(16); 4326 return true; 4327 } 4328 return false; 4329 } 4330 4331 /// getFunctionParamOptimizedAlign - since function arguments are passed via 4332 /// .param space, we may want to increase their alignment in a way that 4333 /// ensures that we can effectively vectorize their loads & stores. We can 4334 /// increase alignment only if the function has internal or has private 4335 /// linkage as for other linkage types callers may already rely on default 4336 /// alignment. To allow using 128-bit vectorized loads/stores, this function 4337 /// ensures that alignment is 16 or greater. 4338 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( 4339 const Function *F, Type *ArgTy, const DataLayout &DL) const { 4340 const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value(); 4341 4342 // If a function has linkage different from internal or private, we 4343 // must use default ABI alignment as external users rely on it. 4344 if (!F->hasLocalLinkage()) 4345 return Align(ABITypeAlign); 4346 4347 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); 4348 return Align(std::max(uint64_t(16), ABITypeAlign)); 4349 } 4350 4351 /// isLegalAddressingMode - Return true if the addressing mode represented 4352 /// by AM is legal for this target, for a load/store of the specified type. 4353 /// Used to guide target specific optimizations, like loop strength reduction 4354 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 4355 /// (CodeGenPrepare.cpp) 4356 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, 4357 const AddrMode &AM, Type *Ty, 4358 unsigned AS, Instruction *I) const { 4359 // AddrMode - This represents an addressing mode of: 4360 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 4361 // 4362 // The legal address modes are 4363 // - [avar] 4364 // - [areg] 4365 // - [areg+immoff] 4366 // - [immAddr] 4367 4368 if (AM.BaseGV) { 4369 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; 4370 } 4371 4372 switch (AM.Scale) { 4373 case 0: // "r", "r+i" or "i" is allowed 4374 break; 4375 case 1: 4376 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 4377 return false; 4378 // Otherwise we have r+i. 
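    // "r+i" corresponds to the [areg+immoff] form listed above.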
4379 break; 4380 default: 4381 // No scale > 1 is allowed 4382 return false; 4383 } 4384 return true; 4385 } 4386 4387 //===----------------------------------------------------------------------===// 4388 // NVPTX Inline Assembly Support 4389 //===----------------------------------------------------------------------===// 4390 4391 /// getConstraintType - Given a constraint letter, return the type of 4392 /// constraint it is for this target. 4393 NVPTXTargetLowering::ConstraintType 4394 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { 4395 if (Constraint.size() == 1) { 4396 switch (Constraint[0]) { 4397 default: 4398 break; 4399 case 'b': 4400 case 'r': 4401 case 'h': 4402 case 'c': 4403 case 'l': 4404 case 'f': 4405 case 'd': 4406 case '0': 4407 case 'N': 4408 return C_RegisterClass; 4409 } 4410 } 4411 return TargetLowering::getConstraintType(Constraint); 4412 } 4413 4414 std::pair<unsigned, const TargetRegisterClass *> 4415 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 4416 StringRef Constraint, 4417 MVT VT) const { 4418 if (Constraint.size() == 1) { 4419 switch (Constraint[0]) { 4420 case 'b': 4421 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 4422 case 'c': 4423 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 4424 case 'h': 4425 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 4426 case 'r': 4427 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 4428 case 'l': 4429 case 'N': 4430 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 4431 case 'f': 4432 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 4433 case 'd': 4434 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 4435 } 4436 } 4437 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 4438 } 4439 4440 //===----------------------------------------------------------------------===// 4441 // NVPTX DAG Combining 4442 //===----------------------------------------------------------------------===// 4443 4444 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, 4445 CodeGenOpt::Level OptLevel) const { 4446 // Always honor command-line argument 4447 if (FMAContractLevelOpt.getNumOccurrences() > 0) 4448 return FMAContractLevelOpt > 0; 4449 4450 // Do not contract if we're not optimizing the code. 4451 if (OptLevel == 0) 4452 return false; 4453 4454 // Honor TargetOptions flags that explicitly say fusion is okay. 4455 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) 4456 return true; 4457 4458 return allowUnsafeFPMath(MF); 4459 } 4460 4461 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { 4462 // Honor TargetOptions flags that explicitly say unsafe math is okay. 4463 if (MF.getTarget().Options.UnsafeFPMath) 4464 return true; 4465 4466 // Allow unsafe math if unsafe-fp-math attribute explicitly says so. 4467 const Function &F = MF.getFunction(); 4468 return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); 4469 } 4470 4471 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 4472 /// operands N0 and N1. This is a helper for PerformADDCombine that is 4473 /// called with the default operands, and if that fails, with commuted 4474 /// operands. 
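/// On success, an integer (add (mul a, b), c) becomes an NVPTXISD::IMAD node,
/// and an FP one becomes an ISD::FMA node when FMA contraction is allowed.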
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const NVPTXSubtarget &Subtarget,
                                             CodeGenOpt::Level OptLevel) {
  SelectionDAG &DAG = DCI.DAG;
  // Skip the vector case; only scalar adds are handled here.
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
    // For integer:
    // Since integer multiply-add costs the same as integer multiply
    // but is more costly than integer add, do the fusion only when
    // the mul is only used in the add.
    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                       N0.getOperand(0), N0.getOperand(1), N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
          &DAG.getTargetLoweringInfo());
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has fewer than 5 uses and all of
      // them are adds.
      // The heuristic is that if a use is not an add, then that use
      // cannot be fused into an fma, so the mul is still needed anyway.
      // If there are more than 4 uses, even if they are all adds, fusing
      // them will increase register pressure.
      //
      int numUses = 0;
      int nonAddCount = 0;
      for (const SDNode *User : N0.getNode()->uses()) {
        numUses++;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for potential register pressure: the difference in
        // IR order approximates the distance between the def and this use, and
        // the longer that distance, the more likely fusion is to increase
        // register pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // node N, which guarantees that the FMA will not increase register
        // pressure at node N.
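        // An operand counts as live if it is a constant, or if it has a use
        // whose IR order is greater than N's, i.e. a use that follows the add.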
4537 bool opIsLive = false; 4538 const SDNode *left = N0.getOperand(0).getNode(); 4539 const SDNode *right = N0.getOperand(1).getNode(); 4540 4541 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right)) 4542 opIsLive = true; 4543 4544 if (!opIsLive) 4545 for (const SDNode *User : left->uses()) { 4546 int orderNo3 = User->getIROrder(); 4547 if (orderNo3 > orderNo) { 4548 opIsLive = true; 4549 break; 4550 } 4551 } 4552 4553 if (!opIsLive) 4554 for (const SDNode *User : right->uses()) { 4555 int orderNo3 = User->getIROrder(); 4556 if (orderNo3 > orderNo) { 4557 opIsLive = true; 4558 break; 4559 } 4560 } 4561 4562 if (!opIsLive) 4563 return SDValue(); 4564 } 4565 4566 return DAG.getNode(ISD::FMA, SDLoc(N), VT, 4567 N0.getOperand(0), N0.getOperand(1), N1); 4568 } 4569 } 4570 4571 return SDValue(); 4572 } 4573 4574 static SDValue PerformStoreRetvalCombine(SDNode *N) { 4575 // Operands from the 2nd to the last one are the values to be stored 4576 for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I) 4577 if (!N->getOperand(I).isUndef()) 4578 return SDValue(); 4579 4580 // Operand 0 is the previous value in the chain. Cannot return EntryToken 4581 // as the previous value will become unused and eliminated later. 4582 return N->getOperand(0); 4583 } 4584 4585 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 4586 /// 4587 static SDValue PerformADDCombine(SDNode *N, 4588 TargetLowering::DAGCombinerInfo &DCI, 4589 const NVPTXSubtarget &Subtarget, 4590 CodeGenOpt::Level OptLevel) { 4591 SDValue N0 = N->getOperand(0); 4592 SDValue N1 = N->getOperand(1); 4593 4594 // First try with the default operand order. 4595 if (SDValue Result = 4596 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) 4597 return Result; 4598 4599 // If that didn't work, try again with the operands commuted. 4600 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel); 4601 } 4602 4603 static SDValue PerformANDCombine(SDNode *N, 4604 TargetLowering::DAGCombinerInfo &DCI) { 4605 // The type legalizer turns a vector load of i8 values into a zextload to i16 4606 // registers, optionally ANY_EXTENDs it (if target type is integer), 4607 // and ANDs off the high 8 bits. Since we turn this load into a 4608 // target-specific DAG node, the DAG combiner fails to eliminate these AND 4609 // nodes. Do that here. 4610 SDValue Val = N->getOperand(0); 4611 SDValue Mask = N->getOperand(1); 4612 4613 if (isa<ConstantSDNode>(Val)) { 4614 std::swap(Val, Mask); 4615 } 4616 4617 SDValue AExt; 4618 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 4619 if (Val.getOpcode() == ISD::ANY_EXTEND) { 4620 AExt = Val; 4621 Val = Val->getOperand(0); 4622 } 4623 4624 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 4625 Val = Val->getOperand(0); 4626 } 4627 4628 if (Val->getOpcode() == NVPTXISD::LoadV2 || 4629 Val->getOpcode() == NVPTXISD::LoadV4) { 4630 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 4631 if (!MaskCnst) { 4632 // Not an AND with a constant 4633 return SDValue(); 4634 } 4635 4636 uint64_t MaskVal = MaskCnst->getZExtValue(); 4637 if (MaskVal != 0xff) { 4638 // Not an AND that chops off top 8 bits 4639 return SDValue(); 4640 } 4641 4642 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 4643 if (!Mem) { 4644 // Not a MemSDNode?!? 
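      // Without the memory node we cannot check the loaded type, so bail out.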
4645 return SDValue(); 4646 } 4647 4648 EVT MemVT = Mem->getMemoryVT(); 4649 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 4650 // We only handle the i8 case 4651 return SDValue(); 4652 } 4653 4654 unsigned ExtType = 4655 cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))-> 4656 getZExtValue(); 4657 if (ExtType == ISD::SEXTLOAD) { 4658 // If for some reason the load is a sextload, the and is needed to zero 4659 // out the high 8 bits 4660 return SDValue(); 4661 } 4662 4663 bool AddTo = false; 4664 if (AExt.getNode() != nullptr) { 4665 // Re-insert the ext as a zext. 4666 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 4667 AExt.getValueType(), Val); 4668 AddTo = true; 4669 } 4670 4671 // If we get here, the AND is unnecessary. Just replace it with the load 4672 DCI.CombineTo(N, Val, AddTo); 4673 } 4674 4675 return SDValue(); 4676 } 4677 4678 static SDValue PerformREMCombine(SDNode *N, 4679 TargetLowering::DAGCombinerInfo &DCI, 4680 CodeGenOpt::Level OptLevel) { 4681 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); 4682 4683 // Don't do anything at less than -O2. 4684 if (OptLevel < CodeGenOpt::Default) 4685 return SDValue(); 4686 4687 SelectionDAG &DAG = DCI.DAG; 4688 SDLoc DL(N); 4689 EVT VT = N->getValueType(0); 4690 bool IsSigned = N->getOpcode() == ISD::SREM; 4691 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; 4692 4693 const SDValue &Num = N->getOperand(0); 4694 const SDValue &Den = N->getOperand(1); 4695 4696 for (const SDNode *U : Num->uses()) { 4697 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && 4698 U->getOperand(1) == Den) { 4699 // Num % Den -> Num - (Num / Den) * Den 4700 return DAG.getNode(ISD::SUB, DL, VT, Num, 4701 DAG.getNode(ISD::MUL, DL, VT, 4702 DAG.getNode(DivOpc, DL, VT, Num, Den), 4703 Den)); 4704 } 4705 } 4706 return SDValue(); 4707 } 4708 4709 enum OperandSignedness { 4710 Signed = 0, 4711 Unsigned, 4712 Unknown 4713 }; 4714 4715 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 4716 /// that can be demoted to \p OptSize bits without loss of information. The 4717 /// signedness of the operand, if determinable, is placed in \p S. 4718 static bool IsMulWideOperandDemotable(SDValue Op, 4719 unsigned OptSize, 4720 OperandSignedness &S) { 4721 S = Unknown; 4722 4723 if (Op.getOpcode() == ISD::SIGN_EXTEND || 4724 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 4725 EVT OrigVT = Op.getOperand(0).getValueType(); 4726 if (OrigVT.getFixedSizeInBits() <= OptSize) { 4727 S = Signed; 4728 return true; 4729 } 4730 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 4731 EVT OrigVT = Op.getOperand(0).getValueType(); 4732 if (OrigVT.getFixedSizeInBits() <= OptSize) { 4733 S = Unsigned; 4734 return true; 4735 } 4736 } 4737 4738 return false; 4739 } 4740 4741 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 4742 /// be demoted to \p OptSize bits without loss of information. If the operands 4743 /// contain a constant, it should appear as the RHS operand. The signedness of 4744 /// the operands is placed in \p IsSigned. 
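/// Returns false if the signedness of the LHS cannot be determined, or if the
/// operands cannot be demoted with a consistent signedness.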
4745 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 4746 unsigned OptSize, 4747 bool &IsSigned) { 4748 OperandSignedness LHSSign; 4749 4750 // The LHS operand must be a demotable op 4751 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 4752 return false; 4753 4754 // We should have been able to determine the signedness from the LHS 4755 if (LHSSign == Unknown) 4756 return false; 4757 4758 IsSigned = (LHSSign == Signed); 4759 4760 // The RHS can be a demotable op or a constant 4761 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 4762 const APInt &Val = CI->getAPIntValue(); 4763 if (LHSSign == Unsigned) { 4764 return Val.isIntN(OptSize); 4765 } else { 4766 return Val.isSignedIntN(OptSize); 4767 } 4768 } else { 4769 OperandSignedness RHSSign; 4770 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 4771 return false; 4772 4773 return LHSSign == RHSSign; 4774 } 4775 } 4776 4777 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 4778 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 4779 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 4780 /// amount. 4781 static SDValue TryMULWIDECombine(SDNode *N, 4782 TargetLowering::DAGCombinerInfo &DCI) { 4783 EVT MulType = N->getValueType(0); 4784 if (MulType != MVT::i32 && MulType != MVT::i64) { 4785 return SDValue(); 4786 } 4787 4788 SDLoc DL(N); 4789 unsigned OptSize = MulType.getSizeInBits() >> 1; 4790 SDValue LHS = N->getOperand(0); 4791 SDValue RHS = N->getOperand(1); 4792 4793 // Canonicalize the multiply so the constant (if any) is on the right 4794 if (N->getOpcode() == ISD::MUL) { 4795 if (isa<ConstantSDNode>(LHS)) { 4796 std::swap(LHS, RHS); 4797 } 4798 } 4799 4800 // If we have a SHL, determine the actual multiply amount 4801 if (N->getOpcode() == ISD::SHL) { 4802 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 4803 if (!ShlRHS) { 4804 return SDValue(); 4805 } 4806 4807 APInt ShiftAmt = ShlRHS->getAPIntValue(); 4808 unsigned BitWidth = MulType.getSizeInBits(); 4809 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 4810 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 4811 RHS = DCI.DAG.getConstant(MulVal, DL, MulType); 4812 } else { 4813 return SDValue(); 4814 } 4815 } 4816 4817 bool Signed; 4818 // Verify that our operands are demotable 4819 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 4820 return SDValue(); 4821 } 4822 4823 EVT DemotedVT; 4824 if (MulType == MVT::i32) { 4825 DemotedVT = MVT::i16; 4826 } else { 4827 DemotedVT = MVT::i32; 4828 } 4829 4830 // Truncate the operands to the correct size. Note that these are just for 4831 // type consistency and will (likely) be eliminated in later phases. 4832 SDValue TruncLHS = 4833 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); 4834 SDValue TruncRHS = 4835 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); 4836 4837 unsigned Opc; 4838 if (Signed) { 4839 Opc = NVPTXISD::MUL_WIDE_SIGNED; 4840 } else { 4841 Opc = NVPTXISD::MUL_WIDE_UNSIGNED; 4842 } 4843 4844 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); 4845 } 4846 4847 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. 
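/// For example, an i32 multiply whose operands are known to fit in 16 bits
/// (e.g. they are sign- or zero-extended from i16) can be selected as PTX
/// mul.wide.s16/mul.wide.u16, which computes the full 32-bit product directly
/// from the 16-bit sources (see TryMULWIDECombine).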
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);

  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
                                   DCI.DAG.getVTList(MVT::i1, MVT::i1),
                                   {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD:
  case ISD::FADD:
    return PerformADDCombine(N, DCI, STI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI);
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    return PerformStoreRetvalCombine(N);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
4932 assert(ResVT.isSimple() && "Can only handle simple types"); 4933 switch (ResVT.getSimpleVT().SimpleTy) { 4934 default: 4935 return; 4936 case MVT::v2i8: 4937 case MVT::v2i16: 4938 case MVT::v2i32: 4939 case MVT::v2i64: 4940 case MVT::v2f16: 4941 case MVT::v2f32: 4942 case MVT::v2f64: 4943 case MVT::v4i8: 4944 case MVT::v4i16: 4945 case MVT::v4i32: 4946 case MVT::v4f16: 4947 case MVT::v4f32: 4948 case MVT::v8f16: // <4 x f16x2> 4949 // This is a "native" vector type 4950 break; 4951 } 4952 4953 LoadSDNode *LD = cast<LoadSDNode>(N); 4954 4955 Align Alignment = LD->getAlign(); 4956 auto &TD = DAG.getDataLayout(); 4957 Align PrefAlign = TD.getPrefTypeAlign(ResVT.getTypeForEVT(*DAG.getContext())); 4958 if (Alignment < PrefAlign) { 4959 // This load is not sufficiently aligned, so bail out and let this vector 4960 // load be scalarized. Note that we may still be able to emit smaller 4961 // vector loads. For example, if we are loading a <4 x float> with an 4962 // alignment of 8, this check will fail but the legalizer will try again 4963 // with 2 x <2 x float>, which will succeed with an alignment of 8. 4964 return; 4965 } 4966 4967 EVT EltVT = ResVT.getVectorElementType(); 4968 unsigned NumElts = ResVT.getVectorNumElements(); 4969 4970 // Since LoadV2 is a target node, we cannot rely on DAG type legalization. 4971 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 4972 // loaded type to i16 and propagate the "real" type as the memory type. 4973 bool NeedTrunc = false; 4974 if (EltVT.getSizeInBits() < 16) { 4975 EltVT = MVT::i16; 4976 NeedTrunc = true; 4977 } 4978 4979 unsigned Opcode = 0; 4980 SDVTList LdResVTs; 4981 bool LoadF16x2 = false; 4982 4983 switch (NumElts) { 4984 default: 4985 return; 4986 case 2: 4987 Opcode = NVPTXISD::LoadV2; 4988 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 4989 break; 4990 case 4: { 4991 Opcode = NVPTXISD::LoadV4; 4992 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 4993 LdResVTs = DAG.getVTList(ListVTs); 4994 break; 4995 } 4996 case 8: { 4997 // v8f16 is a special case. PTX doesn't have ld.v8.f16 4998 // instruction. Instead, we split the vector into v2f16 chunks and 4999 // load them with ld.v4.b32. 5000 assert(EltVT == MVT::f16 && "Unsupported v8 vector type."); 5001 LoadF16x2 = true; 5002 Opcode = NVPTXISD::LoadV4; 5003 EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16, 5004 MVT::Other}; 5005 LdResVTs = DAG.getVTList(ListVTs); 5006 break; 5007 } 5008 } 5009 5010 // Copy regular operands 5011 SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end()); 5012 5013 // The select routine does not have access to the LoadSDNode instance, so 5014 // pass along the extension information 5015 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); 5016 5017 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 5018 LD->getMemoryVT(), 5019 LD->getMemOperand()); 5020 5021 SmallVector<SDValue, 8> ScalarRes; 5022 if (LoadF16x2) { 5023 // Split v2f16 subvectors back into individual elements. 
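    // The load produced NumElts/2 v2f16 values; extract both halves of each so
    // the final build_vector sees NumElts scalar f16 elements.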
5024 NumElts /= 2; 5025 for (unsigned i = 0; i < NumElts; ++i) { 5026 SDValue SubVector = NewLD.getValue(i); 5027 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, 5028 DAG.getIntPtrConstant(0, DL)); 5029 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector, 5030 DAG.getIntPtrConstant(1, DL)); 5031 ScalarRes.push_back(E0); 5032 ScalarRes.push_back(E1); 5033 } 5034 } else { 5035 for (unsigned i = 0; i < NumElts; ++i) { 5036 SDValue Res = NewLD.getValue(i); 5037 if (NeedTrunc) 5038 Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 5039 ScalarRes.push_back(Res); 5040 } 5041 } 5042 5043 SDValue LoadChain = NewLD.getValue(NumElts); 5044 5045 SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes); 5046 5047 Results.push_back(BuildVec); 5048 Results.push_back(LoadChain); 5049 } 5050 5051 static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, 5052 SmallVectorImpl<SDValue> &Results) { 5053 SDValue Chain = N->getOperand(0); 5054 SDValue Intrin = N->getOperand(1); 5055 SDLoc DL(N); 5056 5057 // Get the intrinsic ID 5058 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); 5059 switch (IntrinNo) { 5060 default: 5061 return; 5062 case Intrinsic::nvvm_ldg_global_i: 5063 case Intrinsic::nvvm_ldg_global_f: 5064 case Intrinsic::nvvm_ldg_global_p: 5065 case Intrinsic::nvvm_ldu_global_i: 5066 case Intrinsic::nvvm_ldu_global_f: 5067 case Intrinsic::nvvm_ldu_global_p: { 5068 EVT ResVT = N->getValueType(0); 5069 5070 if (ResVT.isVector()) { 5071 // Vector LDG/LDU 5072 5073 unsigned NumElts = ResVT.getVectorNumElements(); 5074 EVT EltVT = ResVT.getVectorElementType(); 5075 5076 // Since LDU/LDG are target nodes, we cannot rely on DAG type 5077 // legalization. 5078 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 5079 // loaded type to i16 and propagate the "real" type as the memory type. 
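      // Record that the loaded values must be truncated back to the original
      // element type.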
5080 bool NeedTrunc = false; 5081 if (EltVT.getSizeInBits() < 16) { 5082 EltVT = MVT::i16; 5083 NeedTrunc = true; 5084 } 5085 5086 unsigned Opcode = 0; 5087 SDVTList LdResVTs; 5088 5089 switch (NumElts) { 5090 default: 5091 return; 5092 case 2: 5093 switch (IntrinNo) { 5094 default: 5095 return; 5096 case Intrinsic::nvvm_ldg_global_i: 5097 case Intrinsic::nvvm_ldg_global_f: 5098 case Intrinsic::nvvm_ldg_global_p: 5099 Opcode = NVPTXISD::LDGV2; 5100 break; 5101 case Intrinsic::nvvm_ldu_global_i: 5102 case Intrinsic::nvvm_ldu_global_f: 5103 case Intrinsic::nvvm_ldu_global_p: 5104 Opcode = NVPTXISD::LDUV2; 5105 break; 5106 } 5107 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); 5108 break; 5109 case 4: { 5110 switch (IntrinNo) { 5111 default: 5112 return; 5113 case Intrinsic::nvvm_ldg_global_i: 5114 case Intrinsic::nvvm_ldg_global_f: 5115 case Intrinsic::nvvm_ldg_global_p: 5116 Opcode = NVPTXISD::LDGV4; 5117 break; 5118 case Intrinsic::nvvm_ldu_global_i: 5119 case Intrinsic::nvvm_ldu_global_f: 5120 case Intrinsic::nvvm_ldu_global_p: 5121 Opcode = NVPTXISD::LDUV4; 5122 break; 5123 } 5124 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; 5125 LdResVTs = DAG.getVTList(ListVTs); 5126 break; 5127 } 5128 } 5129 5130 SmallVector<SDValue, 8> OtherOps; 5131 5132 // Copy regular operands 5133 5134 OtherOps.push_back(Chain); // Chain 5135 // Skip operand 1 (intrinsic ID) 5136 // Others 5137 OtherOps.append(N->op_begin() + 2, N->op_end()); 5138 5139 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 5140 5141 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, 5142 MemSD->getMemoryVT(), 5143 MemSD->getMemOperand()); 5144 5145 SmallVector<SDValue, 4> ScalarRes; 5146 5147 for (unsigned i = 0; i < NumElts; ++i) { 5148 SDValue Res = NewLD.getValue(i); 5149 if (NeedTrunc) 5150 Res = 5151 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res); 5152 ScalarRes.push_back(Res); 5153 } 5154 5155 SDValue LoadChain = NewLD.getValue(NumElts); 5156 5157 SDValue BuildVec = 5158 DAG.getBuildVector(ResVT, DL, ScalarRes); 5159 5160 Results.push_back(BuildVec); 5161 Results.push_back(LoadChain); 5162 } else { 5163 // i8 LDG/LDU 5164 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && 5165 "Custom handling of non-i8 ldu/ldg?"); 5166 5167 // Just copy all operands as-is 5168 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); 5169 5170 // Force output to i16 5171 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); 5172 5173 MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N); 5174 5175 // We make sure the memory type is i8, which will be used during isel 5176 // to select the proper instruction. 
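      // The loaded value itself is produced as i16 and truncated back to i8
      // below.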
5177 SDValue NewLD = 5178 DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, 5179 MVT::i8, MemSD->getMemOperand()); 5180 5181 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, 5182 NewLD.getValue(0))); 5183 Results.push_back(NewLD.getValue(1)); 5184 } 5185 } 5186 } 5187 } 5188 5189 void NVPTXTargetLowering::ReplaceNodeResults( 5190 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 5191 switch (N->getOpcode()) { 5192 default: 5193 report_fatal_error("Unhandled custom legalization"); 5194 case ISD::LOAD: 5195 ReplaceLoadVector(N, DAG, Results); 5196 return; 5197 case ISD::INTRINSIC_W_CHAIN: 5198 ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); 5199 return; 5200 } 5201 } 5202 5203 NVPTXTargetLowering::AtomicExpansionKind 5204 NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 5205 Type *Ty = AI->getValOperand()->getType(); 5206 5207 if (AI->isFloatingPointOperation()) { 5208 if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) { 5209 if (Ty->isFloatTy()) 5210 return AtomicExpansionKind::None; 5211 if (Ty->isDoubleTy() && STI.hasAtomAddF64()) 5212 return AtomicExpansionKind::None; 5213 } 5214 return AtomicExpansionKind::CmpXChg; 5215 } 5216 5217 assert(Ty->isIntegerTy() && "Ty should be integer at this point"); 5218 auto ITy = cast<llvm::IntegerType>(Ty); 5219 5220 switch (AI->getOperation()) { 5221 default: 5222 return AtomicExpansionKind::CmpXChg; 5223 case AtomicRMWInst::BinOp::And: 5224 case AtomicRMWInst::BinOp::Or: 5225 case AtomicRMWInst::BinOp::Xor: 5226 case AtomicRMWInst::BinOp::Xchg: 5227 switch (ITy->getBitWidth()) { 5228 case 8: 5229 case 16: 5230 return AtomicExpansionKind::CmpXChg; 5231 case 32: 5232 return AtomicExpansionKind::None; 5233 case 64: 5234 if (STI.hasAtomBitwise64()) 5235 return AtomicExpansionKind::None; 5236 return AtomicExpansionKind::CmpXChg; 5237 default: 5238 llvm_unreachable("unsupported width encountered"); 5239 } 5240 case AtomicRMWInst::BinOp::Add: 5241 case AtomicRMWInst::BinOp::Sub: 5242 case AtomicRMWInst::BinOp::Max: 5243 case AtomicRMWInst::BinOp::Min: 5244 case AtomicRMWInst::BinOp::UMax: 5245 case AtomicRMWInst::BinOp::UMin: 5246 switch (ITy->getBitWidth()) { 5247 case 8: 5248 case 16: 5249 return AtomicExpansionKind::CmpXChg; 5250 case 32: 5251 return AtomicExpansionKind::None; 5252 case 64: 5253 if (STI.hasAtomMinMax64()) 5254 return AtomicExpansionKind::None; 5255 return AtomicExpansionKind::CmpXChg; 5256 default: 5257 llvm_unreachable("unsupported width encountered"); 5258 } 5259 } 5260 5261 return AtomicExpansionKind::CmpXChg; 5262 } 5263 5264 // Pin NVPTXTargetObjectFile's vtables to this file. 5265 NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default; 5266 5267 MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( 5268 const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { 5269 return getDataSection(); 5270 } 5271