1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the interfaces that NVPTX uses to lower LLVM code into a 10 // selection DAG. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "NVPTXISelLowering.h" 15 #include "MCTargetDesc/NVPTXBaseInfo.h" 16 #include "NVPTX.h" 17 #include "NVPTXSubtarget.h" 18 #include "NVPTXTargetMachine.h" 19 #include "NVPTXTargetObjectFile.h" 20 #include "NVPTXUtilities.h" 21 #include "llvm/ADT/APInt.h" 22 #include "llvm/ADT/STLExtras.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/ADT/StringRef.h" 25 #include "llvm/CodeGen/Analysis.h" 26 #include "llvm/CodeGen/MachineFunction.h" 27 #include "llvm/CodeGen/MachineMemOperand.h" 28 #include "llvm/CodeGen/MachineValueType.h" 29 #include "llvm/CodeGen/SelectionDAG.h" 30 #include "llvm/CodeGen/SelectionDAGNodes.h" 31 #include "llvm/CodeGen/TargetCallingConv.h" 32 #include "llvm/CodeGen/TargetLowering.h" 33 #include "llvm/CodeGen/ValueTypes.h" 34 #include "llvm/IR/Argument.h" 35 #include "llvm/IR/Attributes.h" 36 #include "llvm/IR/Constants.h" 37 #include "llvm/IR/DataLayout.h" 38 #include "llvm/IR/DerivedTypes.h" 39 #include "llvm/IR/FPEnv.h" 40 #include "llvm/IR/Function.h" 41 #include "llvm/IR/GlobalValue.h" 42 #include "llvm/IR/Instruction.h" 43 #include "llvm/IR/Instructions.h" 44 #include "llvm/IR/IntrinsicsNVPTX.h" 45 #include "llvm/IR/Module.h" 46 #include "llvm/IR/Type.h" 47 #include "llvm/IR/Value.h" 48 #include "llvm/Support/Casting.h" 49 #include "llvm/Support/CodeGen.h" 50 #include "llvm/Support/CommandLine.h" 51 #include "llvm/Support/ErrorHandling.h" 52 #include "llvm/Support/raw_ostream.h" 53 #include "llvm/Target/TargetMachine.h" 54 #include "llvm/Target/TargetOptions.h" 55 #include <algorithm> 56 #include <cassert> 57 #include <cmath> 58 #include <cstdint> 59 #include <iterator> 60 #include <sstream> 61 #include <string> 62 #include <utility> 63 #include <vector> 64 65 #define DEBUG_TYPE "nvptx-lower" 66 67 using namespace llvm; 68 69 static std::atomic<unsigned> GlobalUniqueCallSite; 70 71 static cl::opt<bool> sched4reg( 72 "nvptx-sched4reg", 73 cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false)); 74 75 static cl::opt<unsigned> FMAContractLevelOpt( 76 "nvptx-fma-level", cl::Hidden, 77 cl::desc("NVPTX Specific: FMA contraction (0: don't do it," 78 " 1: do it, 2: do it aggressively)"), 79 cl::init(2)); 80 81 static cl::opt<int> UsePrecDivF32( 82 "nvptx-prec-divf32", cl::Hidden, 83 cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use" 84 " IEEE Compliant F32 div.rnd if available."), 85 cl::init(2)); 86 87 static cl::opt<bool> UsePrecSqrtF32( 88 "nvptx-prec-sqrtf32", cl::Hidden, 89 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), 90 cl::init(true)); 91 92 static cl::opt<bool> ForceMinByValParamAlign( 93 "nvptx-force-min-byval-param-align", cl::Hidden, 94 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" 95 " params of device functions."), 96 cl::init(false)); 97 98 int NVPTXTargetLowering::getDivF32Level() const { 99 if (UsePrecDivF32.getNumOccurrences() > 0) { 100 // If nvptx-prec-divf32=N is used on the command-line, always
honor it 101 return UsePrecDivF32; 102 } else { 103 // Otherwise, use div.approx if fast math is enabled 104 if (getTargetMachine().Options.UnsafeFPMath) 105 return 0; 106 else 107 return 2; 108 } 109 } 110 111 bool NVPTXTargetLowering::usePrecSqrtF32() const { 112 if (UsePrecSqrtF32.getNumOccurrences() > 0) { 113 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it 114 return UsePrecSqrtF32; 115 } else { 116 // Otherwise, use sqrt.approx if fast math is enabled 117 return !getTargetMachine().Options.UnsafeFPMath; 118 } 119 } 120 121 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const { 122 return MF.getDenormalMode(APFloat::IEEEsingle()).Output == 123 DenormalMode::PreserveSign; 124 } 125 126 static bool IsPTXVectorType(MVT VT) { 127 switch (VT.SimpleTy) { 128 default: 129 return false; 130 case MVT::v2i1: 131 case MVT::v4i1: 132 case MVT::v2i8: 133 case MVT::v4i8: 134 case MVT::v2i16: 135 case MVT::v4i16: 136 case MVT::v2i32: 137 case MVT::v4i32: 138 case MVT::v2i64: 139 case MVT::v2f16: 140 case MVT::v4f16: 141 case MVT::v8f16: // <4 x f16x2> 142 case MVT::v2bf16: 143 case MVT::v4bf16: 144 case MVT::v8bf16: // <4 x bf16x2> 145 case MVT::v2f32: 146 case MVT::v4f32: 147 case MVT::v2f64: 148 return true; 149 } 150 } 151 152 static bool Isv2f16Orv2bf16Type(EVT VT) { 153 return (VT == MVT::v2f16 || VT == MVT::v2bf16); 154 } 155 156 static bool Isf16Orbf16Type(MVT VT) { 157 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16); 158 } 159 160 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive 161 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors 162 /// into their primitive components. 163 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the 164 /// same number of types as the Ins/Outs arrays in LowerFormalArguments, 165 /// LowerCall, and LowerReturn. 166 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, 167 Type *Ty, SmallVectorImpl<EVT> &ValueVTs, 168 SmallVectorImpl<uint64_t> *Offsets = nullptr, 169 uint64_t StartingOffset = 0) { 170 SmallVector<EVT, 16> TempVTs; 171 SmallVector<uint64_t, 16> TempOffsets; 172 173 // Special case for i128 - decompose to (i64, i64) 174 if (Ty->isIntegerTy(128)) { 175 ValueVTs.push_back(EVT(MVT::i64)); 176 ValueVTs.push_back(EVT(MVT::i64)); 177 178 if (Offsets) { 179 Offsets->push_back(StartingOffset + 0); 180 Offsets->push_back(StartingOffset + 8); 181 } 182 183 return; 184 } 185 186 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs. 187 if (StructType *STy = dyn_cast<StructType>(Ty)) { 188 auto const *SL = DL.getStructLayout(STy); 189 auto ElementNum = 0; 190 for(auto *EI : STy->elements()) { 191 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets, 192 StartingOffset + SL->getElementOffset(ElementNum)); 193 ++ElementNum; 194 } 195 return; 196 } 197 198 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); 199 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { 200 EVT VT = TempVTs[i]; 201 uint64_t Off = TempOffsets[i]; 202 // Split vectors into individual elements, except for v2f16, which 203 // we will pass as a single scalar. 204 if (VT.isVector()) { 205 unsigned NumElts = VT.getVectorNumElements(); 206 EVT EltVT = VT.getVectorElementType(); 207 // Vectors with an even number of f16 elements will be passed to 208 // us as an array of v2f16/v2bf16 elements. We must match this so we 209 // stay in sync with Ins/Outs. 
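// (For example, a <4 x f16> value is flattened here into two v2f16 pieces at byte offsets 0 and 4, rather than four f16 scalars.)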
210 if ((Isf16Orbf16Type(EltVT.getSimpleVT())) && NumElts % 2 == 0) { 211 EltVT = EltVT == MVT::f16 ? MVT::v2f16 : MVT::v2bf16; 212 NumElts /= 2; 213 } 214 for (unsigned j = 0; j != NumElts; ++j) { 215 ValueVTs.push_back(EltVT); 216 if (Offsets) 217 Offsets->push_back(Off + j * EltVT.getStoreSize()); 218 } 219 } else { 220 ValueVTs.push_back(VT); 221 if (Offsets) 222 Offsets->push_back(Off); 223 } 224 } 225 } 226 227 /// PromoteScalarIntegerPTX 228 /// Used to make sure the arguments/returns are suitable for passing 229 /// and promote them to a larger size if they're not. 230 /// 231 /// The promoted type is placed in \p PromotedVT if the function returns true. 232 static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) { 233 if (VT.isScalarInteger()) { 234 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) { 235 default: 236 llvm_unreachable( 237 "Promotion is not suitable for scalars of size larger than 64-bits"); 238 case 1: 239 *PromotedVT = MVT::i1; 240 break; 241 case 2: 242 case 4: 243 case 8: 244 *PromotedVT = MVT::i8; 245 break; 246 case 16: 247 *PromotedVT = MVT::i16; 248 break; 249 case 32: 250 *PromotedVT = MVT::i32; 251 break; 252 case 64: 253 *PromotedVT = MVT::i64; 254 break; 255 } 256 return EVT(*PromotedVT) != VT; 257 } 258 return false; 259 } 260 261 // Check whether we can merge loads/stores of some of the pieces of a 262 // flattened function parameter or return value into a single vector 263 // load/store. 264 // 265 // The flattened parameter is represented as a list of EVTs and 266 // offsets, and the whole structure is aligned to ParamAlignment. This 267 // function determines whether we can load/store pieces of the 268 // parameter starting at index Idx using a single vectorized op of 269 // size AccessSize. If so, it returns the number of param pieces 270 // covered by the vector op. Otherwise, it returns 1. 271 static unsigned CanMergeParamLoadStoresStartingAt( 272 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs, 273 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) { 274 275 // Can't vectorize if param alignment is not sufficient. 276 if (ParamAlignment < AccessSize) 277 return 1; 278 // Can't vectorize if offset is not aligned. 279 if (Offsets[Idx] & (AccessSize - 1)) 280 return 1; 281 282 EVT EltVT = ValueVTs[Idx]; 283 unsigned EltSize = EltVT.getStoreSize(); 284 285 // Element is too large to vectorize. 286 if (EltSize >= AccessSize) 287 return 1; 288 289 unsigned NumElts = AccessSize / EltSize; 290 // Can't vectorize if AccessSize is not a multiple of EltSize. 291 if (AccessSize != EltSize * NumElts) 292 return 1; 293 294 // We don't have enough elements to vectorize. 295 if (Idx + NumElts > ValueVTs.size()) 296 return 1; 297 298 // PTX ISA can only deal with 2- and 4-element vector ops. 299 if (NumElts != 4 && NumElts != 2) 300 return 1; 301 302 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) { 303 // Types do not match. 304 if (ValueVTs[j] != EltVT) 305 return 1; 306 307 // Elements are not contiguous. 308 if (Offsets[j] - Offsets[j - 1] != EltSize) 309 return 1; 310 } 311 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts). 312 return NumElts; 313 } 314 315 // Flags for tracking per-element vectorization state of loads/stores 316 // of a flattened function parameter or return value. 317 enum ParamVectorizationFlags { 318 PVF_INNER = 0x0, // Middle elements of a vector. 319 PVF_FIRST = 0x1, // First element of the vector. 320 PVF_LAST = 0x2, // Last element of the vector.
321 // Scalar is effectively a 1-element vector. 322 PVF_SCALAR = PVF_FIRST | PVF_LAST 323 }; 324 325 // Computes whether and how we can vectorize the loads/stores of a 326 // flattened function parameter or return value. 327 // 328 // The flattened parameter is represented as the list of ValueVTs and 329 // Offsets, and is aligned to ParamAlignment bytes. We return a vector 330 // of the same size as ValueVTs indicating how each piece should be 331 // loaded/stored (i.e. as a scalar, or as part of a vector 332 // load/store). 333 static SmallVector<ParamVectorizationFlags, 16> 334 VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs, 335 const SmallVectorImpl<uint64_t> &Offsets, 336 Align ParamAlignment, bool IsVAArg = false) { 337 // Set vector size to match ValueVTs and mark all elements as 338 // scalars by default. 339 SmallVector<ParamVectorizationFlags, 16> VectorInfo; 340 VectorInfo.assign(ValueVTs.size(), PVF_SCALAR); 341 342 if (IsVAArg) 343 return VectorInfo; 344 345 // Check what we can vectorize using 128/64/32-bit accesses. 346 for (int I = 0, E = ValueVTs.size(); I != E; ++I) { 347 // Skip elements we've already processed. 348 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state."); 349 for (unsigned AccessSize : {16, 8, 4, 2}) { 350 unsigned NumElts = CanMergeParamLoadStoresStartingAt( 351 I, AccessSize, ValueVTs, Offsets, ParamAlignment); 352 // Mark vectorized elements. 353 switch (NumElts) { 354 default: 355 llvm_unreachable("Unexpected return value"); 356 case 1: 357 // Can't vectorize using this size, try next smaller size. 358 continue; 359 case 2: 360 assert(I + 1 < E && "Not enough elements."); 361 VectorInfo[I] = PVF_FIRST; 362 VectorInfo[I + 1] = PVF_LAST; 363 I += 1; 364 break; 365 case 4: 366 assert(I + 3 < E && "Not enough elements."); 367 VectorInfo[I] = PVF_FIRST; 368 VectorInfo[I + 1] = PVF_INNER; 369 VectorInfo[I + 2] = PVF_INNER; 370 VectorInfo[I + 3] = PVF_LAST; 371 I += 3; 372 break; 373 } 374 // Break out of the inner loop because we've already succeeded 375 // using the largest possible AccessSize. 376 break; 377 } 378 } 379 return VectorInfo; 380 } 381 382 // NVPTXTargetLowering Constructor. 383 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, 384 const NVPTXSubtarget &STI) 385 : TargetLowering(TM), nvTM(&TM), STI(STI) { 386 // Always lower memset, memcpy, and memmove intrinsics to load/store 387 // instructions, rather 388 // than generating calls to memset, memcpy or memmove. 389 MaxStoresPerMemset = (unsigned) 0xFFFFFFFF; 390 MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF; 391 MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; 392 393 setBooleanContents(ZeroOrNegativeOneBooleanContent); 394 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 395 396 // Jump is Expensive. Don't create extra control flow for 'and', 'or' 397 // condition branches. 398 setJumpIsExpensive(true); 399 400 // Wide divides are _very_ slow. Try to reduce the width of the divide if 401 // possible. 402 addBypassSlowDiv(64, 32); 403 404 // By default, use the Source scheduling 405 if (sched4reg) 406 setSchedulingPreference(Sched::RegPressure); 407 else 408 setSchedulingPreference(Sched::Source); 409 410 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, 411 LegalizeAction NoF16Action) { 412 setOperationAction(Op, VT, STI.allowFP16Math() ?
Action : NoF16Action); 413 }; 414 415 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, 416 LegalizeAction NoBF16Action) { 417 bool IsOpSupported = STI.hasBF16Math(); 418 // A few instructions are available only on sm_90. 419 switch(Op) { 420 case ISD::FADD: 421 case ISD::FMUL: 422 case ISD::FSUB: 423 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78; 424 break; 425 } 426 setOperationAction( 427 Op, VT, IsOpSupported ? Action : NoBF16Action); 428 }; 429 430 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); 431 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); 432 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); 433 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); 434 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); 435 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); 436 addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass); 437 addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass); 438 addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass); 439 addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass); 440 441 // Conversion to/from FP16/FP16x2 is always legal. 442 setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal); 443 setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal); 444 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); 445 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); 446 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand); 447 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand); 448 449 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote); 450 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand); 451 452 // Conversion to/from BF16/BF16x2 is always legal. 453 setOperationAction(ISD::SINT_TO_FP, MVT::bf16, Legal); 454 setOperationAction(ISD::FP_TO_SINT, MVT::bf16, Legal); 455 setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom); 456 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom); 457 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand); 458 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand); 459 460 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote); 461 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand); 462 // Operations not directly supported by NVPTX. 463 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, 464 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { 465 setOperationAction(ISD::SELECT_CC, VT, Expand); 466 setOperationAction(ISD::BR_CC, VT, Expand); 467 } 468 469 // Some SIGN_EXTEND_INREG can be done using the cvt instruction. 470 // For others we will expand to a SHL/SRA pair. 471 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); 472 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 473 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 474 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 475 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 476 477 setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); 478 setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); 479 setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); 480 setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); 481 setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); 482 setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); 483 484 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 485 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); 486 487 // TODO: we may consider expanding ROTL/ROTR on older GPUs.
Currently on GPUs 488 // that don't have h/w rotation we lower them to multi-instruction assembly. 489 // See ROT*_sw in NVPTXIntrInfo.td 490 setOperationAction(ISD::ROTL, MVT::i64, Legal); 491 setOperationAction(ISD::ROTR, MVT::i64, Legal); 492 setOperationAction(ISD::ROTL, MVT::i32, Legal); 493 setOperationAction(ISD::ROTR, MVT::i32, Legal); 494 495 setOperationAction(ISD::ROTL, MVT::i16, Expand); 496 setOperationAction(ISD::ROTR, MVT::i16, Expand); 497 setOperationAction(ISD::ROTL, MVT::i8, Expand); 498 setOperationAction(ISD::ROTR, MVT::i8, Expand); 499 setOperationAction(ISD::BSWAP, MVT::i16, Expand); 500 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 501 setOperationAction(ISD::BSWAP, MVT::i64, Expand); 502 503 // Indirect branch is not supported. 504 // This also disables Jump Table creation. 505 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 506 setOperationAction(ISD::BRIND, MVT::Other, Expand); 507 508 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 509 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 510 511 // We want to legalize constant-related memmove and memcpy 512 // intrinsics. 513 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 514 515 // Turn FP extload into load/fpextend 516 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); 517 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); 518 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); 519 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); 520 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); 521 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); 522 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); 523 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand); 524 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand); 525 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); 526 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); 527 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); 528 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand); 529 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand); 530 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); 531 // Turn FP truncstore into trunc + store.
532 // FIXME: vector types should also be expanded 533 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 534 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 535 setTruncStoreAction(MVT::f32, MVT::bf16, Expand); 536 setTruncStoreAction(MVT::f64, MVT::bf16, Expand); 537 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 538 539 // PTX does not support load / store predicate registers 540 setOperationAction(ISD::LOAD, MVT::i1, Custom); 541 setOperationAction(ISD::STORE, MVT::i1, Custom); 542 543 for (MVT VT : MVT::integer_valuetypes()) { 544 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 545 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 546 setTruncStoreAction(VT, MVT::i1, Expand); 547 } 548 549 // This is legal in NVPTX 550 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 551 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 552 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 553 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); 554 555 // TRAP can be lowered to PTX trap 556 setOperationAction(ISD::TRAP, MVT::Other, Legal); 557 558 // Register custom handling for vector loads/stores 559 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 560 if (IsPTXVectorType(VT)) { 561 setOperationAction(ISD::LOAD, VT, Custom); 562 setOperationAction(ISD::STORE, VT, Custom); 563 setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); 564 } 565 } 566 567 // Support varargs. 568 setOperationAction(ISD::VASTART, MVT::Other, Custom); 569 setOperationAction(ISD::VAARG, MVT::Other, Custom); 570 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 571 setOperationAction(ISD::VAEND, MVT::Other, Expand); 572 573 // Custom handling for i8 intrinsics 574 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); 575 576 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) { 577 setOperationAction(ISD::ABS, Ty, Legal); 578 setOperationAction(ISD::SMIN, Ty, Legal); 579 setOperationAction(ISD::SMAX, Ty, Legal); 580 setOperationAction(ISD::UMIN, Ty, Legal); 581 setOperationAction(ISD::UMAX, Ty, Legal); 582 583 setOperationAction(ISD::CTPOP, Ty, Legal); 584 setOperationAction(ISD::CTLZ, Ty, Legal); 585 } 586 587 setOperationAction(ISD::ADDC, MVT::i32, Legal); 588 setOperationAction(ISD::ADDE, MVT::i32, Legal); 589 setOperationAction(ISD::SUBC, MVT::i32, Legal); 590 setOperationAction(ISD::SUBE, MVT::i32, Legal); 591 if (STI.getPTXVersion() >= 43) { 592 setOperationAction(ISD::ADDC, MVT::i64, Legal); 593 setOperationAction(ISD::ADDE, MVT::i64, Legal); 594 setOperationAction(ISD::SUBC, MVT::i64, Legal); 595 setOperationAction(ISD::SUBE, MVT::i64, Legal); 596 } 597 598 setOperationAction(ISD::CTTZ, MVT::i16, Expand); 599 setOperationAction(ISD::CTTZ, MVT::i32, Expand); 600 setOperationAction(ISD::CTTZ, MVT::i64, Expand); 601 602 // PTX does not directly support SELP of i1, so promote to i32 first 603 setOperationAction(ISD::SELECT, MVT::i1, Custom); 604 605 // PTX cannot multiply two i64s in a single instruction. 606 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 607 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 608 609 // We have some custom DAG combine patterns for these nodes 610 setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::FADD, ISD::MUL, ISD::SHL, 611 ISD::SREM, ISD::UREM}); 612 613 // setcc for f16x2 and bf16x2 needs special handling to prevent 614 // legalizer's attempt to scalarize it due to v2i1 not being legal. 
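// (When the combine fires, an f16x2 compare is emitted as a single NVPTXISD::SETP_F16X2 node that produces both i1 results at once; see the node name table below.)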
615 if (STI.allowFP16Math() || STI.hasBF16Math()) 616 setTargetDAGCombine(ISD::SETCC); 617 618 // Promote fp16 arithmetic if fp16 hardware isn't available or the 619 // user passed --nvptx-no-fp16-math. The flag is useful because, 620 // although sm_53+ GPUs have some sort of FP16 support in 621 // hardware, only sm_53 and sm_60 have full implementation. Others 622 // only have token amount of hardware and are likely to run faster 623 // by using fp32 units instead. 624 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { 625 setFP16OperationAction(Op, MVT::f16, Legal, Promote); 626 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); 627 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 628 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 629 // bf16 must be promoted to f32. 630 if (getOperationAction(Op, MVT::bf16) == Promote) 631 AddPromotedToType(Op, MVT::bf16, MVT::f32); 632 } 633 634 // f16/f16x2 neg was introduced in PTX 60, SM_53. 635 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 && 636 STI.getPTXVersion() >= 60 && 637 STI.allowFP16Math(); 638 for (const auto &VT : {MVT::f16, MVT::v2f16}) 639 setOperationAction(ISD::FNEG, VT, 640 IsFP16FP16x2NegAvailable ? Legal : Expand); 641 642 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand); 643 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand); 644 // (would be) Library functions. 645 646 // These map to conversion instructions for scalar FP types. 647 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, 648 ISD::FROUNDEVEN, ISD::FTRUNC}) { 649 setOperationAction(Op, MVT::bf16, Legal); 650 setOperationAction(Op, MVT::f16, Legal); 651 setOperationAction(Op, MVT::f32, Legal); 652 setOperationAction(Op, MVT::f64, Legal); 653 setOperationAction(Op, MVT::v2f16, Expand); 654 setOperationAction(Op, MVT::v2bf16, Expand); 655 } 656 657 setOperationAction(ISD::FROUND, MVT::f16, Promote); 658 setOperationAction(ISD::FROUND, MVT::v2f16, Expand); 659 setOperationAction(ISD::FROUND, MVT::bf16, Promote); 660 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand); 661 setOperationAction(ISD::FROUND, MVT::f32, Custom); 662 setOperationAction(ISD::FROUND, MVT::f64, Custom); 663 664 665 // 'Expand' implements FCOPYSIGN without calling an external library. 666 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 667 setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); 668 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); 669 setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand); 670 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 671 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 672 673 // These map to corresponding instructions for f32/f64. f16 must be 674 // promoted to f32. v2f16 is expanded to f16, which is then promoted 675 // to f32. 676 for (const auto &Op : 677 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FABS}) { 678 setOperationAction(Op, MVT::f16, Promote); 679 setOperationAction(Op, MVT::bf16, Promote); 680 setOperationAction(Op, MVT::f32, Legal); 681 setOperationAction(Op, MVT::f64, Legal); 682 setOperationAction(Op, MVT::v2f16, Expand); 683 setOperationAction(Op, MVT::v2bf16, Expand); 684 } 685 // max.f16, max.f16x2 and max.NaN are supported on sm_80+. 686 auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) { 687 bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; 688 return IsAtLeastSm80 ? 
Legal : NotSm80Action; 689 }; 690 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { 691 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote); 692 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 693 setOperationAction(Op, MVT::f32, Legal); 694 setOperationAction(Op, MVT::f64, Legal); 695 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); 696 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 697 } 698 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) { 699 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand); 700 setFP16OperationAction(Op, MVT::bf16, Legal, Expand); 701 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand)); 702 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); 703 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 704 } 705 706 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. 707 // No FPOW or FREM in PTX. 708 709 // Now deduce the information based on the above mentioned 710 // actions 711 computeRegisterProperties(STI.getRegisterInfo()); 712 713 setMinCmpXchgSizeInBits(32); 714 } 715 716 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 717 switch ((NVPTXISD::NodeType)Opcode) { 718 case NVPTXISD::FIRST_NUMBER: 719 break; 720 case NVPTXISD::CALL: 721 return "NVPTXISD::CALL"; 722 case NVPTXISD::RET_GLUE: 723 return "NVPTXISD::RET_GLUE"; 724 case NVPTXISD::LOAD_PARAM: 725 return "NVPTXISD::LOAD_PARAM"; 726 case NVPTXISD::Wrapper: 727 return "NVPTXISD::Wrapper"; 728 case NVPTXISD::DeclareParam: 729 return "NVPTXISD::DeclareParam"; 730 case NVPTXISD::DeclareScalarParam: 731 return "NVPTXISD::DeclareScalarParam"; 732 case NVPTXISD::DeclareRet: 733 return "NVPTXISD::DeclareRet"; 734 case NVPTXISD::DeclareScalarRet: 735 return "NVPTXISD::DeclareScalarRet"; 736 case NVPTXISD::DeclareRetParam: 737 return "NVPTXISD::DeclareRetParam"; 738 case NVPTXISD::PrintCall: 739 return "NVPTXISD::PrintCall"; 740 case NVPTXISD::PrintConvergentCall: 741 return "NVPTXISD::PrintConvergentCall"; 742 case NVPTXISD::PrintCallUni: 743 return "NVPTXISD::PrintCallUni"; 744 case NVPTXISD::PrintConvergentCallUni: 745 return "NVPTXISD::PrintConvergentCallUni"; 746 case NVPTXISD::LoadParam: 747 return "NVPTXISD::LoadParam"; 748 case NVPTXISD::LoadParamV2: 749 return "NVPTXISD::LoadParamV2"; 750 case NVPTXISD::LoadParamV4: 751 return "NVPTXISD::LoadParamV4"; 752 case NVPTXISD::StoreParam: 753 return "NVPTXISD::StoreParam"; 754 case NVPTXISD::StoreParamV2: 755 return "NVPTXISD::StoreParamV2"; 756 case NVPTXISD::StoreParamV4: 757 return "NVPTXISD::StoreParamV4"; 758 case NVPTXISD::StoreParamS32: 759 return "NVPTXISD::StoreParamS32"; 760 case NVPTXISD::StoreParamU32: 761 return "NVPTXISD::StoreParamU32"; 762 case NVPTXISD::CallArgBegin: 763 return "NVPTXISD::CallArgBegin"; 764 case NVPTXISD::CallArg: 765 return "NVPTXISD::CallArg"; 766 case NVPTXISD::LastCallArg: 767 return "NVPTXISD::LastCallArg"; 768 case NVPTXISD::CallArgEnd: 769 return "NVPTXISD::CallArgEnd"; 770 case NVPTXISD::CallVoid: 771 return "NVPTXISD::CallVoid"; 772 case NVPTXISD::CallVal: 773 return "NVPTXISD::CallVal"; 774 case NVPTXISD::CallSymbol: 775 return "NVPTXISD::CallSymbol"; 776 case NVPTXISD::Prototype: 777 return "NVPTXISD::Prototype"; 778 case NVPTXISD::MoveParam: 779 return "NVPTXISD::MoveParam"; 780 case NVPTXISD::StoreRetval: 781 return "NVPTXISD::StoreRetval"; 782 case NVPTXISD::StoreRetvalV2: 783 return "NVPTXISD::StoreRetvalV2"; 784 case NVPTXISD::StoreRetvalV4: 785 
return "NVPTXISD::StoreRetvalV4"; 786 case NVPTXISD::PseudoUseParam: 787 return "NVPTXISD::PseudoUseParam"; 788 case NVPTXISD::RETURN: 789 return "NVPTXISD::RETURN"; 790 case NVPTXISD::CallSeqBegin: 791 return "NVPTXISD::CallSeqBegin"; 792 case NVPTXISD::CallSeqEnd: 793 return "NVPTXISD::CallSeqEnd"; 794 case NVPTXISD::CallPrototype: 795 return "NVPTXISD::CallPrototype"; 796 case NVPTXISD::ProxyReg: 797 return "NVPTXISD::ProxyReg"; 798 case NVPTXISD::LoadV2: 799 return "NVPTXISD::LoadV2"; 800 case NVPTXISD::LoadV4: 801 return "NVPTXISD::LoadV4"; 802 case NVPTXISD::LDGV2: 803 return "NVPTXISD::LDGV2"; 804 case NVPTXISD::LDGV4: 805 return "NVPTXISD::LDGV4"; 806 case NVPTXISD::LDUV2: 807 return "NVPTXISD::LDUV2"; 808 case NVPTXISD::LDUV4: 809 return "NVPTXISD::LDUV4"; 810 case NVPTXISD::StoreV2: 811 return "NVPTXISD::StoreV2"; 812 case NVPTXISD::StoreV4: 813 return "NVPTXISD::StoreV4"; 814 case NVPTXISD::FUN_SHFL_CLAMP: 815 return "NVPTXISD::FUN_SHFL_CLAMP"; 816 case NVPTXISD::FUN_SHFR_CLAMP: 817 return "NVPTXISD::FUN_SHFR_CLAMP"; 818 case NVPTXISD::IMAD: 819 return "NVPTXISD::IMAD"; 820 case NVPTXISD::SETP_F16X2: 821 return "NVPTXISD::SETP_F16X2"; 822 case NVPTXISD::Dummy: 823 return "NVPTXISD::Dummy"; 824 case NVPTXISD::MUL_WIDE_SIGNED: 825 return "NVPTXISD::MUL_WIDE_SIGNED"; 826 case NVPTXISD::MUL_WIDE_UNSIGNED: 827 return "NVPTXISD::MUL_WIDE_UNSIGNED"; 828 case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32"; 829 case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat"; 830 case NVPTXISD::Tex1DFloatFloatLevel: 831 return "NVPTXISD::Tex1DFloatFloatLevel"; 832 case NVPTXISD::Tex1DFloatFloatGrad: 833 return "NVPTXISD::Tex1DFloatFloatGrad"; 834 case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32"; 835 case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float"; 836 case NVPTXISD::Tex1DS32FloatLevel: 837 return "NVPTXISD::Tex1DS32FloatLevel"; 838 case NVPTXISD::Tex1DS32FloatGrad: 839 return "NVPTXISD::Tex1DS32FloatGrad"; 840 case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32"; 841 case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float"; 842 case NVPTXISD::Tex1DU32FloatLevel: 843 return "NVPTXISD::Tex1DU32FloatLevel"; 844 case NVPTXISD::Tex1DU32FloatGrad: 845 return "NVPTXISD::Tex1DU32FloatGrad"; 846 case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32"; 847 case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat"; 848 case NVPTXISD::Tex1DArrayFloatFloatLevel: 849 return "NVPTXISD::Tex1DArrayFloatFloatLevel"; 850 case NVPTXISD::Tex1DArrayFloatFloatGrad: 851 return "NVPTXISD::Tex1DArrayFloatFloatGrad"; 852 case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32"; 853 case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float"; 854 case NVPTXISD::Tex1DArrayS32FloatLevel: 855 return "NVPTXISD::Tex1DArrayS32FloatLevel"; 856 case NVPTXISD::Tex1DArrayS32FloatGrad: 857 return "NVPTXISD::Tex1DArrayS32FloatGrad"; 858 case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32"; 859 case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float"; 860 case NVPTXISD::Tex1DArrayU32FloatLevel: 861 return "NVPTXISD::Tex1DArrayU32FloatLevel"; 862 case NVPTXISD::Tex1DArrayU32FloatGrad: 863 return "NVPTXISD::Tex1DArrayU32FloatGrad"; 864 case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32"; 865 case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat"; 866 case NVPTXISD::Tex2DFloatFloatLevel: 867 return "NVPTXISD::Tex2DFloatFloatLevel"; 868 case 
NVPTXISD::Tex2DFloatFloatGrad: 869 return "NVPTXISD::Tex2DFloatFloatGrad"; 870 case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32"; 871 case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float"; 872 case NVPTXISD::Tex2DS32FloatLevel: 873 return "NVPTXISD::Tex2DS32FloatLevel"; 874 case NVPTXISD::Tex2DS32FloatGrad: 875 return "NVPTXISD::Tex2DS32FloatGrad"; 876 case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32"; 877 case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float"; 878 case NVPTXISD::Tex2DU32FloatLevel: 879 return "NVPTXISD::Tex2DU32FloatLevel"; 880 case NVPTXISD::Tex2DU32FloatGrad: 881 return "NVPTXISD::Tex2DU32FloatGrad"; 882 case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32"; 883 case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat"; 884 case NVPTXISD::Tex2DArrayFloatFloatLevel: 885 return "NVPTXISD::Tex2DArrayFloatFloatLevel"; 886 case NVPTXISD::Tex2DArrayFloatFloatGrad: 887 return "NVPTXISD::Tex2DArrayFloatFloatGrad"; 888 case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32"; 889 case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float"; 890 case NVPTXISD::Tex2DArrayS32FloatLevel: 891 return "NVPTXISD::Tex2DArrayS32FloatLevel"; 892 case NVPTXISD::Tex2DArrayS32FloatGrad: 893 return "NVPTXISD::Tex2DArrayS32FloatGrad"; 894 case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32"; 895 case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float"; 896 case NVPTXISD::Tex2DArrayU32FloatLevel: 897 return "NVPTXISD::Tex2DArrayU32FloatLevel"; 898 case NVPTXISD::Tex2DArrayU32FloatGrad: 899 return "NVPTXISD::Tex2DArrayU32FloatGrad"; 900 case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32"; 901 case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat"; 902 case NVPTXISD::Tex3DFloatFloatLevel: 903 return "NVPTXISD::Tex3DFloatFloatLevel"; 904 case NVPTXISD::Tex3DFloatFloatGrad: 905 return "NVPTXISD::Tex3DFloatFloatGrad"; 906 case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32"; 907 case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float"; 908 case NVPTXISD::Tex3DS32FloatLevel: 909 return "NVPTXISD::Tex3DS32FloatLevel"; 910 case NVPTXISD::Tex3DS32FloatGrad: 911 return "NVPTXISD::Tex3DS32FloatGrad"; 912 case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32"; 913 case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float"; 914 case NVPTXISD::Tex3DU32FloatLevel: 915 return "NVPTXISD::Tex3DU32FloatLevel"; 916 case NVPTXISD::Tex3DU32FloatGrad: 917 return "NVPTXISD::Tex3DU32FloatGrad"; 918 case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat"; 919 case NVPTXISD::TexCubeFloatFloatLevel: 920 return "NVPTXISD::TexCubeFloatFloatLevel"; 921 case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float"; 922 case NVPTXISD::TexCubeS32FloatLevel: 923 return "NVPTXISD::TexCubeS32FloatLevel"; 924 case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float"; 925 case NVPTXISD::TexCubeU32FloatLevel: 926 return "NVPTXISD::TexCubeU32FloatLevel"; 927 case NVPTXISD::TexCubeArrayFloatFloat: 928 return "NVPTXISD::TexCubeArrayFloatFloat"; 929 case NVPTXISD::TexCubeArrayFloatFloatLevel: 930 return "NVPTXISD::TexCubeArrayFloatFloatLevel"; 931 case NVPTXISD::TexCubeArrayS32Float: 932 return "NVPTXISD::TexCubeArrayS32Float"; 933 case NVPTXISD::TexCubeArrayS32FloatLevel: 934 return "NVPTXISD::TexCubeArrayS32FloatLevel"; 935 case NVPTXISD::TexCubeArrayU32Float: 936 return "NVPTXISD::TexCubeArrayU32Float"; 937 case 
NVPTXISD::TexCubeArrayU32FloatLevel: 938 return "NVPTXISD::TexCubeArrayU32FloatLevel"; 939 case NVPTXISD::Tld4R2DFloatFloat: 940 return "NVPTXISD::Tld4R2DFloatFloat"; 941 case NVPTXISD::Tld4G2DFloatFloat: 942 return "NVPTXISD::Tld4G2DFloatFloat"; 943 case NVPTXISD::Tld4B2DFloatFloat: 944 return "NVPTXISD::Tld4B2DFloatFloat"; 945 case NVPTXISD::Tld4A2DFloatFloat: 946 return "NVPTXISD::Tld4A2DFloatFloat"; 947 case NVPTXISD::Tld4R2DS64Float: 948 return "NVPTXISD::Tld4R2DS64Float"; 949 case NVPTXISD::Tld4G2DS64Float: 950 return "NVPTXISD::Tld4G2DS64Float"; 951 case NVPTXISD::Tld4B2DS64Float: 952 return "NVPTXISD::Tld4B2DS64Float"; 953 case NVPTXISD::Tld4A2DS64Float: 954 return "NVPTXISD::Tld4A2DS64Float"; 955 case NVPTXISD::Tld4R2DU64Float: 956 return "NVPTXISD::Tld4R2DU64Float"; 957 case NVPTXISD::Tld4G2DU64Float: 958 return "NVPTXISD::Tld4G2DU64Float"; 959 case NVPTXISD::Tld4B2DU64Float: 960 return "NVPTXISD::Tld4B2DU64Float"; 961 case NVPTXISD::Tld4A2DU64Float: 962 return "NVPTXISD::Tld4A2DU64Float"; 963 964 case NVPTXISD::TexUnified1DFloatS32: 965 return "NVPTXISD::TexUnified1DFloatS32"; 966 case NVPTXISD::TexUnified1DFloatFloat: 967 return "NVPTXISD::TexUnified1DFloatFloat"; 968 case NVPTXISD::TexUnified1DFloatFloatLevel: 969 return "NVPTXISD::TexUnified1DFloatFloatLevel"; 970 case NVPTXISD::TexUnified1DFloatFloatGrad: 971 return "NVPTXISD::TexUnified1DFloatFloatGrad"; 972 case NVPTXISD::TexUnified1DS32S32: 973 return "NVPTXISD::TexUnified1DS32S32"; 974 case NVPTXISD::TexUnified1DS32Float: 975 return "NVPTXISD::TexUnified1DS32Float"; 976 case NVPTXISD::TexUnified1DS32FloatLevel: 977 return "NVPTXISD::TexUnified1DS32FloatLevel"; 978 case NVPTXISD::TexUnified1DS32FloatGrad: 979 return "NVPTXISD::TexUnified1DS32FloatGrad"; 980 case NVPTXISD::TexUnified1DU32S32: 981 return "NVPTXISD::TexUnified1DU32S32"; 982 case NVPTXISD::TexUnified1DU32Float: 983 return "NVPTXISD::TexUnified1DU32Float"; 984 case NVPTXISD::TexUnified1DU32FloatLevel: 985 return "NVPTXISD::TexUnified1DU32FloatLevel"; 986 case NVPTXISD::TexUnified1DU32FloatGrad: 987 return "NVPTXISD::TexUnified1DU32FloatGrad"; 988 case NVPTXISD::TexUnified1DArrayFloatS32: 989 return "NVPTXISD::TexUnified1DArrayFloatS32"; 990 case NVPTXISD::TexUnified1DArrayFloatFloat: 991 return "NVPTXISD::TexUnified1DArrayFloatFloat"; 992 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 993 return "NVPTXISD::TexUnified1DArrayFloatFloatLevel"; 994 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 995 return "NVPTXISD::TexUnified1DArrayFloatFloatGrad"; 996 case NVPTXISD::TexUnified1DArrayS32S32: 997 return "NVPTXISD::TexUnified1DArrayS32S32"; 998 case NVPTXISD::TexUnified1DArrayS32Float: 999 return "NVPTXISD::TexUnified1DArrayS32Float"; 1000 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 1001 return "NVPTXISD::TexUnified1DArrayS32FloatLevel"; 1002 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 1003 return "NVPTXISD::TexUnified1DArrayS32FloatGrad"; 1004 case NVPTXISD::TexUnified1DArrayU32S32: 1005 return "NVPTXISD::TexUnified1DArrayU32S32"; 1006 case NVPTXISD::TexUnified1DArrayU32Float: 1007 return "NVPTXISD::TexUnified1DArrayU32Float"; 1008 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 1009 return "NVPTXISD::TexUnified1DArrayU32FloatLevel"; 1010 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 1011 return "NVPTXISD::TexUnified1DArrayU32FloatGrad"; 1012 case NVPTXISD::TexUnified2DFloatS32: 1013 return "NVPTXISD::TexUnified2DFloatS32"; 1014 case NVPTXISD::TexUnified2DFloatFloat: 1015 return "NVPTXISD::TexUnified2DFloatFloat"; 1016 case 
NVPTXISD::TexUnified2DFloatFloatLevel: 1017 return "NVPTXISD::TexUnified2DFloatFloatLevel"; 1018 case NVPTXISD::TexUnified2DFloatFloatGrad: 1019 return "NVPTXISD::TexUnified2DFloatFloatGrad"; 1020 case NVPTXISD::TexUnified2DS32S32: 1021 return "NVPTXISD::TexUnified2DS32S32"; 1022 case NVPTXISD::TexUnified2DS32Float: 1023 return "NVPTXISD::TexUnified2DS32Float"; 1024 case NVPTXISD::TexUnified2DS32FloatLevel: 1025 return "NVPTXISD::TexUnified2DS32FloatLevel"; 1026 case NVPTXISD::TexUnified2DS32FloatGrad: 1027 return "NVPTXISD::TexUnified2DS32FloatGrad"; 1028 case NVPTXISD::TexUnified2DU32S32: 1029 return "NVPTXISD::TexUnified2DU32S32"; 1030 case NVPTXISD::TexUnified2DU32Float: 1031 return "NVPTXISD::TexUnified2DU32Float"; 1032 case NVPTXISD::TexUnified2DU32FloatLevel: 1033 return "NVPTXISD::TexUnified2DU32FloatLevel"; 1034 case NVPTXISD::TexUnified2DU32FloatGrad: 1035 return "NVPTXISD::TexUnified2DU32FloatGrad"; 1036 case NVPTXISD::TexUnified2DArrayFloatS32: 1037 return "NVPTXISD::TexUnified2DArrayFloatS32"; 1038 case NVPTXISD::TexUnified2DArrayFloatFloat: 1039 return "NVPTXISD::TexUnified2DArrayFloatFloat"; 1040 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 1041 return "NVPTXISD::TexUnified2DArrayFloatFloatLevel"; 1042 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 1043 return "NVPTXISD::TexUnified2DArrayFloatFloatGrad"; 1044 case NVPTXISD::TexUnified2DArrayS32S32: 1045 return "NVPTXISD::TexUnified2DArrayS32S32"; 1046 case NVPTXISD::TexUnified2DArrayS32Float: 1047 return "NVPTXISD::TexUnified2DArrayS32Float"; 1048 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 1049 return "NVPTXISD::TexUnified2DArrayS32FloatLevel"; 1050 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 1051 return "NVPTXISD::TexUnified2DArrayS32FloatGrad"; 1052 case NVPTXISD::TexUnified2DArrayU32S32: 1053 return "NVPTXISD::TexUnified2DArrayU32S32"; 1054 case NVPTXISD::TexUnified2DArrayU32Float: 1055 return "NVPTXISD::TexUnified2DArrayU32Float"; 1056 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 1057 return "NVPTXISD::TexUnified2DArrayU32FloatLevel"; 1058 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 1059 return "NVPTXISD::TexUnified2DArrayU32FloatGrad"; 1060 case NVPTXISD::TexUnified3DFloatS32: 1061 return "NVPTXISD::TexUnified3DFloatS32"; 1062 case NVPTXISD::TexUnified3DFloatFloat: 1063 return "NVPTXISD::TexUnified3DFloatFloat"; 1064 case NVPTXISD::TexUnified3DFloatFloatLevel: 1065 return "NVPTXISD::TexUnified3DFloatFloatLevel"; 1066 case NVPTXISD::TexUnified3DFloatFloatGrad: 1067 return "NVPTXISD::TexUnified3DFloatFloatGrad"; 1068 case NVPTXISD::TexUnified3DS32S32: 1069 return "NVPTXISD::TexUnified3DS32S32"; 1070 case NVPTXISD::TexUnified3DS32Float: 1071 return "NVPTXISD::TexUnified3DS32Float"; 1072 case NVPTXISD::TexUnified3DS32FloatLevel: 1073 return "NVPTXISD::TexUnified3DS32FloatLevel"; 1074 case NVPTXISD::TexUnified3DS32FloatGrad: 1075 return "NVPTXISD::TexUnified3DS32FloatGrad"; 1076 case NVPTXISD::TexUnified3DU32S32: 1077 return "NVPTXISD::TexUnified3DU32S32"; 1078 case NVPTXISD::TexUnified3DU32Float: 1079 return "NVPTXISD::TexUnified3DU32Float"; 1080 case NVPTXISD::TexUnified3DU32FloatLevel: 1081 return "NVPTXISD::TexUnified3DU32FloatLevel"; 1082 case NVPTXISD::TexUnified3DU32FloatGrad: 1083 return "NVPTXISD::TexUnified3DU32FloatGrad"; 1084 case NVPTXISD::TexUnifiedCubeFloatFloat: 1085 return "NVPTXISD::TexUnifiedCubeFloatFloat"; 1086 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 1087 return "NVPTXISD::TexUnifiedCubeFloatFloatLevel"; 1088 case NVPTXISD::TexUnifiedCubeS32Float: 1089 return 
"NVPTXISD::TexUnifiedCubeS32Float"; 1090 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 1091 return "NVPTXISD::TexUnifiedCubeS32FloatLevel"; 1092 case NVPTXISD::TexUnifiedCubeU32Float: 1093 return "NVPTXISD::TexUnifiedCubeU32Float"; 1094 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 1095 return "NVPTXISD::TexUnifiedCubeU32FloatLevel"; 1096 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 1097 return "NVPTXISD::TexUnifiedCubeArrayFloatFloat"; 1098 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 1099 return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel"; 1100 case NVPTXISD::TexUnifiedCubeArrayS32Float: 1101 return "NVPTXISD::TexUnifiedCubeArrayS32Float"; 1102 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 1103 return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel"; 1104 case NVPTXISD::TexUnifiedCubeArrayU32Float: 1105 return "NVPTXISD::TexUnifiedCubeArrayU32Float"; 1106 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 1107 return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel"; 1108 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 1109 return "NVPTXISD::Tld4UnifiedR2DFloatFloat"; 1110 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 1111 return "NVPTXISD::Tld4UnifiedG2DFloatFloat"; 1112 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 1113 return "NVPTXISD::Tld4UnifiedB2DFloatFloat"; 1114 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 1115 return "NVPTXISD::Tld4UnifiedA2DFloatFloat"; 1116 case NVPTXISD::Tld4UnifiedR2DS64Float: 1117 return "NVPTXISD::Tld4UnifiedR2DS64Float"; 1118 case NVPTXISD::Tld4UnifiedG2DS64Float: 1119 return "NVPTXISD::Tld4UnifiedG2DS64Float"; 1120 case NVPTXISD::Tld4UnifiedB2DS64Float: 1121 return "NVPTXISD::Tld4UnifiedB2DS64Float"; 1122 case NVPTXISD::Tld4UnifiedA2DS64Float: 1123 return "NVPTXISD::Tld4UnifiedA2DS64Float"; 1124 case NVPTXISD::Tld4UnifiedR2DU64Float: 1125 return "NVPTXISD::Tld4UnifiedR2DU64Float"; 1126 case NVPTXISD::Tld4UnifiedG2DU64Float: 1127 return "NVPTXISD::Tld4UnifiedG2DU64Float"; 1128 case NVPTXISD::Tld4UnifiedB2DU64Float: 1129 return "NVPTXISD::Tld4UnifiedB2DU64Float"; 1130 case NVPTXISD::Tld4UnifiedA2DU64Float: 1131 return "NVPTXISD::Tld4UnifiedA2DU64Float"; 1132 1133 case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp"; 1134 case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp"; 1135 case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp"; 1136 case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp"; 1137 case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp"; 1138 case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp"; 1139 case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp"; 1140 case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp"; 1141 case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp"; 1142 case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp"; 1143 case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp"; 1144 1145 case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp"; 1146 case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp"; 1147 case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp"; 1148 case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp"; 1149 case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp"; 1150 case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp"; 1151 case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp"; 1152 case NVPTXISD::Suld1DArrayV2I64Clamp:return 
"NVPTXISD::Suld1DArrayV2I64Clamp"; 1153 case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp"; 1154 case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp"; 1155 case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp"; 1156 1157 case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp"; 1158 case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp"; 1159 case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp"; 1160 case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp"; 1161 case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp"; 1162 case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp"; 1163 case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp"; 1164 case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp"; 1165 case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp"; 1166 case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp"; 1167 case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp"; 1168 1169 case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp"; 1170 case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp"; 1171 case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp"; 1172 case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp"; 1173 case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp"; 1174 case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp"; 1175 case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp"; 1176 case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp"; 1177 case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp"; 1178 case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp"; 1179 case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp"; 1180 1181 case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp"; 1182 case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp"; 1183 case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp"; 1184 case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp"; 1185 case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp"; 1186 case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp"; 1187 case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp"; 1188 case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp"; 1189 case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp"; 1190 case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp"; 1191 case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp"; 1192 1193 case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap"; 1194 case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap"; 1195 case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap"; 1196 case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap"; 1197 case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap"; 1198 case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap"; 1199 case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap"; 1200 case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap"; 1201 case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap"; 1202 case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap"; 1203 case 
NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap"; 1204 1205 case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap"; 1206 case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap"; 1207 case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap"; 1208 case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap"; 1209 case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap"; 1210 case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap"; 1211 case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap"; 1212 case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap"; 1213 case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap"; 1214 case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap"; 1215 case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap"; 1216 1217 case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap"; 1218 case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap"; 1219 case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap"; 1220 case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap"; 1221 case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap"; 1222 case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap"; 1223 case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap"; 1224 case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap"; 1225 case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap"; 1226 case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap"; 1227 case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap"; 1228 1229 case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap"; 1230 case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap"; 1231 case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap"; 1232 case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap"; 1233 case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap"; 1234 case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap"; 1235 case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap"; 1236 case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap"; 1237 case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap"; 1238 case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap"; 1239 case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap"; 1240 1241 case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap"; 1242 case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap"; 1243 case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap"; 1244 case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap"; 1245 case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap"; 1246 case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap"; 1247 case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap"; 1248 case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap"; 1249 case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap"; 1250 case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap"; 1251 case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap"; 1252 1253 case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero"; 1254 case NVPTXISD::Suld1DI16Zero: return 
"NVPTXISD::Suld1DI16Zero"; 1255 case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero"; 1256 case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero"; 1257 case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero"; 1258 case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero"; 1259 case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero"; 1260 case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero"; 1261 case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero"; 1262 case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero"; 1263 case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero"; 1264 1265 case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero"; 1266 case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero"; 1267 case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero"; 1268 case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero"; 1269 case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero"; 1270 case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero"; 1271 case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero"; 1272 case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero"; 1273 case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero"; 1274 case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero"; 1275 case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero"; 1276 1277 case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero"; 1278 case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero"; 1279 case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero"; 1280 case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero"; 1281 case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero"; 1282 case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero"; 1283 case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero"; 1284 case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero"; 1285 case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero"; 1286 case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero"; 1287 case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero"; 1288 1289 case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero"; 1290 case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero"; 1291 case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero"; 1292 case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero"; 1293 case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero"; 1294 case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero"; 1295 case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero"; 1296 case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero"; 1297 case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero"; 1298 case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero"; 1299 case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero"; 1300 1301 case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero"; 1302 case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero"; 1303 case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero"; 1304 case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero"; 1305 case 
NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero"; 1306 case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero"; 1307 case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero"; 1308 case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero"; 1309 case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero"; 1310 case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero"; 1311 case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero"; 1312 } 1313 return nullptr; 1314 } 1315 1316 TargetLoweringBase::LegalizeTypeAction 1317 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1318 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 1319 VT.getScalarType() == MVT::i1) 1320 return TypeSplitVector; 1321 if (Isv2f16Orv2bf16Type(VT)) 1322 return TypeLegal; 1323 return TargetLoweringBase::getPreferredVectorAction(VT); 1324 } 1325 1326 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1327 int Enabled, int &ExtraSteps, 1328 bool &UseOneConst, 1329 bool Reciprocal) const { 1330 if (!(Enabled == ReciprocalEstimate::Enabled || 1331 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1332 return SDValue(); 1333 1334 if (ExtraSteps == ReciprocalEstimate::Unspecified) 1335 ExtraSteps = 0; 1336 1337 SDLoc DL(Operand); 1338 EVT VT = Operand.getValueType(); 1339 bool Ftz = useF32FTZ(DAG.getMachineFunction()); 1340 1341 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { 1342 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 1343 DAG.getConstant(IID, DL, MVT::i32), Operand); 1344 }; 1345 1346 // The sqrt and rsqrt refinement processes assume we always start out with an 1347 // approximation of the rsqrt. Therefore, if we're going to do any refinement 1348 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing 1349 // any refinement, we must return a regular sqrt. 1350 if (Reciprocal || ExtraSteps > 0) { 1351 if (VT == MVT::f32) 1352 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f 1353 : Intrinsic::nvvm_rsqrt_approx_f); 1354 else if (VT == MVT::f64) 1355 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); 1356 else 1357 return SDValue(); 1358 } else { 1359 if (VT == MVT::f32) 1360 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f 1361 : Intrinsic::nvvm_sqrt_approx_f); 1362 else { 1363 // There's no sqrt.approx.f64 instruction, so we emit 1364 // reciprocal(rsqrt(x)). This is faster than 1365 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain 1366 // x * rsqrt(x).) 
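      // As a rough sketch, the node built below corresponds to this PTX
      // sequence (register names are illustrative only):
      //   rsqrt.approx.f64   %t, %x;
      //   rcp.approx.ftz.f64 %r, %t;   // %r is approximately sqrt(%x)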
1367 return DAG.getNode( 1368 ISD::INTRINSIC_WO_CHAIN, DL, VT, 1369 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), 1370 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); 1371 } 1372 } 1373 } 1374 1375 SDValue 1376 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 1377 SDLoc dl(Op); 1378 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); 1379 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); 1380 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); 1381 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); 1382 } 1383 1384 static bool IsTypePassedAsArray(const Type *Ty) { 1385 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) || 1386 Ty->isHalfTy() || Ty->isBFloatTy(); 1387 } 1388 1389 std::string NVPTXTargetLowering::getPrototype( 1390 const DataLayout &DL, Type *retTy, const ArgListTy &Args, 1391 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment, 1392 std::optional<std::pair<unsigned, const APInt &>> VAInfo, 1393 const CallBase &CB, unsigned UniqueCallSite) const { 1394 auto PtrVT = getPointerTy(DL); 1395 1396 bool isABI = (STI.getSmVersion() >= 20); 1397 assert(isABI && "Non-ABI compilation is not supported"); 1398 if (!isABI) 1399 return ""; 1400 1401 std::string Prototype; 1402 raw_string_ostream O(Prototype); 1403 O << "prototype_" << UniqueCallSite << " : .callprototype "; 1404 1405 if (retTy->getTypeID() == Type::VoidTyID) { 1406 O << "()"; 1407 } else { 1408 O << "("; 1409 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) && 1410 !IsTypePassedAsArray(retTy)) { 1411 unsigned size = 0; 1412 if (auto *ITy = dyn_cast<IntegerType>(retTy)) { 1413 size = ITy->getBitWidth(); 1414 } else { 1415 assert(retTy->isFloatingPointTy() && 1416 "Floating point type expected here"); 1417 size = retTy->getPrimitiveSizeInBits(); 1418 } 1419 // PTX ABI requires all scalar return values to be at least 32 1420 // bits in size. fp16 normally uses .b16 as its storage type in 1421 // PTX, so its size must be adjusted here, too. 1422 size = promoteScalarArgumentSize(size); 1423 1424 O << ".param .b" << size << " _"; 1425 } else if (isa<PointerType>(retTy)) { 1426 O << ".param .b" << PtrVT.getSizeInBits() << " _"; 1427 } else if (IsTypePassedAsArray(retTy)) { 1428 O << ".param .align " << (retAlignment ? retAlignment->value() : 0) 1429 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; 1430 } else { 1431 llvm_unreachable("Unknown return type"); 1432 } 1433 O << ") "; 1434 } 1435 O << "_ ("; 1436 1437 bool first = true; 1438 1439 const Function *F = CB.getFunction(); 1440 unsigned NumArgs = VAInfo ? 
VAInfo->first : Args.size(); 1441 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) { 1442 Type *Ty = Args[i].Ty; 1443 if (!first) { 1444 O << ", "; 1445 } 1446 first = false; 1447 1448 if (!Outs[OIdx].Flags.isByVal()) { 1449 if (IsTypePassedAsArray(Ty)) { 1450 unsigned ParamAlign = 0; 1451 const CallInst *CallI = cast<CallInst>(&CB); 1452 // +1 because index 0 is reserved for return type alignment 1453 if (!getAlign(*CallI, i + 1, ParamAlign)) 1454 ParamAlign = getFunctionParamOptimizedAlign(F, Ty, DL).value(); 1455 O << ".param .align " << ParamAlign << " .b8 "; 1456 O << "_"; 1457 O << "[" << DL.getTypeAllocSize(Ty) << "]"; 1458 // update the index for Outs 1459 SmallVector<EVT, 16> vtparts; 1460 ComputeValueVTs(*this, DL, Ty, vtparts); 1461 if (unsigned len = vtparts.size()) 1462 OIdx += len - 1; 1463 continue; 1464 } 1465 // i8 types in IR will be i16 types in SDAG 1466 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1467 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1468 "type mismatch between callee prototype and arguments"); 1469 // scalar type 1470 unsigned sz = 0; 1471 if (isa<IntegerType>(Ty)) { 1472 sz = cast<IntegerType>(Ty)->getBitWidth(); 1473 sz = promoteScalarArgumentSize(sz); 1474 } else if (isa<PointerType>(Ty)) { 1475 sz = PtrVT.getSizeInBits(); 1476 } else { 1477 sz = Ty->getPrimitiveSizeInBits(); 1478 } 1479 O << ".param .b" << sz << " "; 1480 O << "_"; 1481 continue; 1482 } 1483 1484 Type *ETy = Args[i].IndirectType; 1485 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1486 Align ParamByValAlign = 1487 getFunctionByValParamAlign(F, ETy, InitialAlign, DL); 1488 1489 O << ".param .align " << ParamByValAlign.value() << " .b8 "; 1490 O << "_"; 1491 O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; 1492 } 1493 1494 if (VAInfo) 1495 O << (first ? "" : ",") << " .param .align " << VAInfo->second 1496 << " .b8 _[]\n"; 1497 O << ")"; 1498 if (shouldEmitPTXNoReturn(&CB, *nvTM)) 1499 O << " .noreturn"; 1500 O << ";"; 1501 1502 return Prototype; 1503 } 1504 1505 Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, 1506 const CallBase *CB, Type *Ty, 1507 unsigned Idx, 1508 const DataLayout &DL) const { 1509 if (!CB) { 1510 // CallSite is zero, fallback to ABI type alignment 1511 return DL.getABITypeAlign(Ty); 1512 } 1513 1514 unsigned Alignment = 0; 1515 const Function *DirectCallee = CB->getCalledFunction(); 1516 1517 if (!DirectCallee) { 1518 // We don't have a direct function symbol, but that may be because of 1519 // constant cast instructions in the call. 
1520 1521 // With bitcast'd call targets, the instruction will be the call 1522 if (const auto *CI = dyn_cast<CallInst>(CB)) { 1523 // Check if we have call alignment metadata 1524 if (getAlign(*CI, Idx, Alignment)) 1525 return Align(Alignment); 1526 } 1527 DirectCallee = getMaybeBitcastedCallee(CB); 1528 } 1529 1530 // Check for function alignment information if we found that the 1531 // ultimate target is a Function 1532 if (DirectCallee) { 1533 if (getAlign(*DirectCallee, Idx, Alignment)) 1534 return Align(Alignment); 1535 // If alignment information is not available, fall back to the 1536 // default function param optimized type alignment 1537 return getFunctionParamOptimizedAlign(DirectCallee, Ty, DL); 1538 } 1539 1540 // Call is indirect, fall back to the ABI type alignment 1541 return DL.getABITypeAlign(Ty); 1542 } 1543 1544 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1545 SmallVectorImpl<SDValue> &InVals) const { 1546 1547 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30)) 1548 report_fatal_error( 1549 "Support for variadic functions (unsized array parameter) introduced " 1550 "in PTX ISA version 6.0 and requires target sm_30."); 1551 1552 SelectionDAG &DAG = CLI.DAG; 1553 SDLoc dl = CLI.DL; 1554 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1555 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1556 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1557 SDValue Chain = CLI.Chain; 1558 SDValue Callee = CLI.Callee; 1559 bool &isTailCall = CLI.IsTailCall; 1560 ArgListTy &Args = CLI.getArgs(); 1561 Type *RetTy = CLI.RetTy; 1562 const CallBase *CB = CLI.CB; 1563 const DataLayout &DL = DAG.getDataLayout(); 1564 1565 bool isABI = (STI.getSmVersion() >= 20); 1566 assert(isABI && "Non-ABI compilation is not supported"); 1567 if (!isABI) 1568 return Chain; 1569 1570 // Variadic arguments. 1571 // 1572 // Normally, for each argument, we declare a param scalar or a param 1573 // byte array in the .param space, and store the argument value to that 1574 // param scalar or array starting at offset 0. 1575 // 1576 // In the case of the first variadic argument, we declare a vararg byte array 1577 // with size 0. The exact size of this array isn't known at this point, so 1578 // it'll be patched later. All the variadic arguments will be stored to this 1579 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is 1580 // initially set to 0, so it can be used for non-variadic arguments (which use 1581 // 0 offset) to simplify the code. 1582 // 1583 // After all vararg is processed, 'VAOffset' holds the size of the 1584 // vararg byte array. 1585 1586 SDValue VADeclareParam; // vararg byte array 1587 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic 1588 unsigned VAOffset = 0; // current offset in the param array 1589 1590 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); 1591 SDValue TempChain = Chain; 1592 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); 1593 SDValue InGlue = Chain.getValue(1); 1594 1595 unsigned ParamCount = 0; 1596 // Args.size() and Outs.size() need not match. 1597 // Outs.size() will be larger 1598 // * if there is an aggregate argument with multiple fields (each field 1599 // showing up separately in Outs) 1600 // * if there is a vector argument with more than typical vector-length 1601 // elements (generally if more than 4) where each vector element is 1602 // individually present in Outs. 
1603 // So a different index should be used for indexing into Outs/OutVals. 1604 // See similar issue in LowerFormalArguments. 1605 unsigned OIdx = 0; 1606 // Declare the .params or .reg need to pass values 1607 // to the function 1608 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1609 EVT VT = Outs[OIdx].VT; 1610 Type *Ty = Args[i].Ty; 1611 bool IsVAArg = (i >= CLI.NumFixedArgs); 1612 bool IsByVal = Outs[OIdx].Flags.isByVal(); 1613 1614 SmallVector<EVT, 16> VTs; 1615 SmallVector<uint64_t, 16> Offsets; 1616 1617 assert((!IsByVal || Args[i].IndirectType) && 1618 "byval arg must have indirect type"); 1619 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty); 1620 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset); 1621 1622 Align ArgAlign; 1623 if (IsByVal) { 1624 // The ByValAlign in the Outs[OIdx].Flags is always set at this point, 1625 // so we don't need to worry whether it's naturally aligned or not. 1626 // See TargetLowering::LowerCallTo(). 1627 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1628 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy, 1629 InitialAlign, DL); 1630 if (IsVAArg) 1631 VAOffset = alignTo(VAOffset, ArgAlign); 1632 } else { 1633 ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL); 1634 } 1635 1636 unsigned TypeSize = 1637 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); 1638 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1639 1640 bool NeedAlign; // Does argument declaration specify alignment? 1641 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty); 1642 if (IsVAArg) { 1643 if (ParamCount == FirstVAArg) { 1644 SDValue DeclareParamOps[] = { 1645 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32), 1646 DAG.getConstant(ParamCount, dl, MVT::i32), 1647 DAG.getConstant(1, dl, MVT::i32), InGlue}; 1648 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, 1649 DeclareParamVTs, DeclareParamOps); 1650 } 1651 NeedAlign = PassAsArray; 1652 } else if (PassAsArray) { 1653 // declare .param .align <align> .b8 .param<n>[<size>]; 1654 SDValue DeclareParamOps[] = { 1655 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1656 DAG.getConstant(ParamCount, dl, MVT::i32), 1657 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue}; 1658 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1659 DeclareParamOps); 1660 NeedAlign = true; 1661 } else { 1662 // declare .param .b<size> .param<n>; 1663 if (VT.isInteger() || VT.isFloatingPoint()) { 1664 // PTX ABI requires integral types to be at least 32 bits in 1665 // size. FP16 is loaded/stored using i16, so it's handled 1666 // here as well. 1667 TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8; 1668 } 1669 SDValue DeclareScalarParamOps[] = { 1670 Chain, DAG.getConstant(ParamCount, dl, MVT::i32), 1671 DAG.getConstant(TypeSize * 8, dl, MVT::i32), 1672 DAG.getConstant(0, dl, MVT::i32), InGlue}; 1673 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1674 DeclareScalarParamOps); 1675 NeedAlign = false; 1676 } 1677 InGlue = Chain.getValue(1); 1678 1679 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1680 // than 32-bits are sign extended or zero extended, depending on 1681 // whether they are signed or unsigned types. This case applies 1682 // only to scalar parameters and not to aggregate values. 
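    // For example, an i8 argument is extended to i32 below, the call site
    // declares it as ".param .b32 param<N>", and the extended value is then
    // written with st.param.b32 (a sketch; the exact parameter name and
    // offset depend on the surrounding call).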
1683 bool ExtendIntegerParam = 1684 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1685 1686 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg); 1687 SmallVector<SDValue, 6> StoreOperands; 1688 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1689 EVT EltVT = VTs[j]; 1690 int CurOffset = Offsets[j]; 1691 MaybeAlign PartAlign; 1692 if (NeedAlign) 1693 PartAlign = commonAlignment(ArgAlign, CurOffset); 1694 1695 // New store. 1696 if (VectorInfo[j] & PVF_FIRST) { 1697 assert(StoreOperands.empty() && "Unfinished preceding store."); 1698 StoreOperands.push_back(Chain); 1699 StoreOperands.push_back( 1700 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); 1701 StoreOperands.push_back(DAG.getConstant( 1702 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), 1703 dl, MVT::i32)); 1704 } 1705 1706 SDValue StVal = OutVals[OIdx]; 1707 1708 MVT PromotedVT; 1709 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 1710 EltVT = EVT(PromotedVT); 1711 } 1712 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) { 1713 llvm::ISD::NodeType Ext = 1714 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1715 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal); 1716 } 1717 1718 if (IsByVal) { 1719 auto PtrVT = getPointerTy(DL); 1720 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal, 1721 DAG.getConstant(CurOffset, dl, PtrVT)); 1722 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(), 1723 PartAlign); 1724 } else if (ExtendIntegerParam) { 1725 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1726 // zext/sext to i32 1727 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1728 : ISD::ZERO_EXTEND, 1729 dl, MVT::i32, StVal); 1730 } 1731 1732 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) { 1733 // Use 16-bit registers for small stores as it's the 1734 // smallest general purpose register size supported by NVPTX. 1735 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1736 } 1737 1738 // Record the value to store. 1739 StoreOperands.push_back(StVal); 1740 1741 if (VectorInfo[j] & PVF_LAST) { 1742 unsigned NumElts = StoreOperands.size() - 3; 1743 NVPTXISD::NodeType Op; 1744 switch (NumElts) { 1745 case 1: 1746 Op = NVPTXISD::StoreParam; 1747 break; 1748 case 2: 1749 Op = NVPTXISD::StoreParamV2; 1750 break; 1751 case 4: 1752 Op = NVPTXISD::StoreParamV4; 1753 break; 1754 default: 1755 llvm_unreachable("Invalid vector info."); 1756 } 1757 1758 StoreOperands.push_back(InGlue); 1759 1760 // Adjust type of the store op if we've extended the scalar 1761 // return value. 1762 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; 1763 1764 Chain = DAG.getMemIntrinsicNode( 1765 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1766 TheStoreType, MachinePointerInfo(), PartAlign, 1767 MachineMemOperand::MOStore); 1768 InGlue = Chain.getValue(1); 1769 1770 // Cleanup. 1771 StoreOperands.clear(); 1772 1773 // TODO: We may need to support vector types that can be passed 1774 // as scalars in variadic arguments. 
1775 if (!IsByVal && IsVAArg) { 1776 assert(NumElts == 1 && 1777 "Vectorization is expected to be disabled for variadics."); 1778 VAOffset += DL.getTypeAllocSize( 1779 TheStoreType.getTypeForEVT(*DAG.getContext())); 1780 } 1781 } 1782 if (!IsByVal) 1783 ++OIdx; 1784 } 1785 assert(StoreOperands.empty() && "Unfinished parameter store."); 1786 if (!IsByVal && VTs.size() > 0) 1787 --OIdx; 1788 ++ParamCount; 1789 if (IsByVal && IsVAArg) 1790 VAOffset += TypeSize; 1791 } 1792 1793 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); 1794 MaybeAlign retAlignment = std::nullopt; 1795 1796 // Handle Result 1797 if (Ins.size() > 0) { 1798 SmallVector<EVT, 16> resvtparts; 1799 ComputeValueVTs(*this, DL, RetTy, resvtparts); 1800 1801 // Declare 1802 // .param .align N .b8 retval0[<size-in-bytes>], or 1803 // .param .b<size-in-bits> retval0 1804 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); 1805 if (!IsTypePassedAsArray(RetTy)) { 1806 resultsz = promoteScalarArgumentSize(resultsz); 1807 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1808 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1809 DAG.getConstant(resultsz, dl, MVT::i32), 1810 DAG.getConstant(0, dl, MVT::i32), InGlue }; 1811 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, 1812 DeclareRetOps); 1813 InGlue = Chain.getValue(1); 1814 } else { 1815 retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 1816 assert(retAlignment && "retAlignment is guaranteed to be set"); 1817 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1818 SDValue DeclareRetOps[] = { 1819 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), 1820 DAG.getConstant(resultsz / 8, dl, MVT::i32), 1821 DAG.getConstant(0, dl, MVT::i32), InGlue}; 1822 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, 1823 DeclareRetOps); 1824 InGlue = Chain.getValue(1); 1825 } 1826 } 1827 1828 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); 1829 // Set the size of the vararg param byte array if the callee is a variadic 1830 // function and the variadic part is not empty. 1831 if (HasVAArgs) { 1832 SDValue DeclareParamOps[] = { 1833 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), 1834 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32), 1835 VADeclareParam.getOperand(4)}; 1836 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(), 1837 VADeclareParam->getVTList(), DeclareParamOps); 1838 } 1839 1840 // Both indirect calls and libcalls have nullptr Func. In order to distinguish 1841 // between them we must rely on the call site value which is valid for 1842 // indirect calls but is always null for libcalls. 1843 bool isIndirectCall = !Func && CB; 1844 1845 if (isa<ExternalSymbolSDNode>(Callee)) { 1846 Function* CalleeFunc = nullptr; 1847 1848 // Try to find the callee in the current module. 1849 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc); 1850 assert(CalleeFunc != nullptr && "Libcall callee must be set."); 1851 1852 // Set the "libcall callee" attribute to indicate that the function 1853 // must always have a declaration. 1854 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true"); 1855 } 1856 1857 if (isIndirectCall) { 1858 // This is indirect function call case : PTX requires a prototype of the 1859 // form 1860 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); 1861 // to be emitted, and the label has to used as the last arg of call 1862 // instruction. 
1863 // The prototype is embedded in a string and put as the operand for a 1864 // CallPrototype SDNode which will print out to the value of the string. 1865 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1866 std::string Proto = getPrototype( 1867 DL, RetTy, Args, Outs, retAlignment, 1868 HasVAArgs 1869 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair( 1870 CLI.NumFixedArgs, 1871 cast<ConstantSDNode>(VADeclareParam->getOperand(1)) 1872 ->getAPIntValue())) 1873 : std::nullopt, 1874 *CB, UniqueCallSite); 1875 const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); 1876 SDValue ProtoOps[] = { 1877 Chain, 1878 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), 1879 InGlue, 1880 }; 1881 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); 1882 InGlue = Chain.getValue(1); 1883 } 1884 // Op to just print "call" 1885 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1886 SDValue PrintCallOps[] = { 1887 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue 1888 }; 1889 // We model convergent calls as separate opcodes. 1890 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; 1891 if (CLI.IsConvergent) 1892 Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni 1893 : NVPTXISD::PrintConvergentCall; 1894 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 1895 InGlue = Chain.getValue(1); 1896 1897 // Ops to print out the function name 1898 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1899 SDValue CallVoidOps[] = { Chain, Callee, InGlue }; 1900 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 1901 InGlue = Chain.getValue(1); 1902 1903 // Ops to print out the param list 1904 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1905 SDValue CallArgBeginOps[] = { Chain, InGlue }; 1906 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 1907 CallArgBeginOps); 1908 InGlue = Chain.getValue(1); 1909 1910 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e; 1911 ++i) { 1912 unsigned opcode; 1913 if (i == (e - 1)) 1914 opcode = NVPTXISD::LastCallArg; 1915 else 1916 opcode = NVPTXISD::CallArg; 1917 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1918 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1919 DAG.getConstant(i, dl, MVT::i32), InGlue }; 1920 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 1921 InGlue = Chain.getValue(1); 1922 } 1923 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1924 SDValue CallArgEndOps[] = { Chain, 1925 DAG.getConstant(isIndirectCall ? 
0 : 1, dl, MVT::i32), 1926 InGlue }; 1927 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 1928 InGlue = Chain.getValue(1); 1929 1930 if (isIndirectCall) { 1931 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1932 SDValue PrototypeOps[] = { 1933 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue}; 1934 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 1935 InGlue = Chain.getValue(1); 1936 } 1937 1938 SmallVector<SDValue, 16> ProxyRegOps; 1939 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates; 1940 1941 // Generate loads from param memory/moves from registers for result 1942 if (Ins.size() > 0) { 1943 SmallVector<EVT, 16> VTs; 1944 SmallVector<uint64_t, 16> Offsets; 1945 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 1946 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 1947 1948 Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL); 1949 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 1950 1951 SmallVector<EVT, 6> LoadVTs; 1952 int VecIdx = -1; // Index of the first element of the vector. 1953 1954 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 1955 // 32-bits are sign extended or zero extended, depending on whether 1956 // they are signed or unsigned types. 1957 bool ExtendIntegerRetVal = 1958 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 1959 1960 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 1961 bool needTruncate = false; 1962 EVT TheLoadType = VTs[i]; 1963 EVT EltType = Ins[i].VT; 1964 Align EltAlign = commonAlignment(RetAlign, Offsets[i]); 1965 MVT PromotedVT; 1966 1967 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) { 1968 TheLoadType = EVT(PromotedVT); 1969 EltType = EVT(PromotedVT); 1970 needTruncate = true; 1971 } 1972 1973 if (ExtendIntegerRetVal) { 1974 TheLoadType = MVT::i32; 1975 EltType = MVT::i32; 1976 needTruncate = true; 1977 } else if (TheLoadType.getSizeInBits() < 16) { 1978 if (VTs[i].isInteger()) 1979 needTruncate = true; 1980 EltType = MVT::i16; 1981 } 1982 1983 // Record index of the very first element of the vector. 
1984 if (VectorInfo[i] & PVF_FIRST) { 1985 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 1986 VecIdx = i; 1987 } 1988 1989 LoadVTs.push_back(EltType); 1990 1991 if (VectorInfo[i] & PVF_LAST) { 1992 unsigned NumElts = LoadVTs.size(); 1993 LoadVTs.push_back(MVT::Other); 1994 LoadVTs.push_back(MVT::Glue); 1995 NVPTXISD::NodeType Op; 1996 switch (NumElts) { 1997 case 1: 1998 Op = NVPTXISD::LoadParam; 1999 break; 2000 case 2: 2001 Op = NVPTXISD::LoadParamV2; 2002 break; 2003 case 4: 2004 Op = NVPTXISD::LoadParamV4; 2005 break; 2006 default: 2007 llvm_unreachable("Invalid vector info."); 2008 } 2009 2010 SDValue LoadOperands[] = { 2011 Chain, DAG.getConstant(1, dl, MVT::i32), 2012 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue}; 2013 SDValue RetVal = DAG.getMemIntrinsicNode( 2014 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 2015 MachinePointerInfo(), EltAlign, 2016 MachineMemOperand::MOLoad); 2017 2018 for (unsigned j = 0; j < NumElts; ++j) { 2019 ProxyRegOps.push_back(RetVal.getValue(j)); 2020 2021 if (needTruncate) 2022 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT)); 2023 else 2024 ProxyRegTruncates.push_back(std::optional<MVT>()); 2025 } 2026 2027 Chain = RetVal.getValue(NumElts); 2028 InGlue = RetVal.getValue(NumElts + 1); 2029 2030 // Cleanup 2031 VecIdx = -1; 2032 LoadVTs.clear(); 2033 } 2034 } 2035 } 2036 2037 Chain = 2038 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); 2039 InGlue = Chain.getValue(1); 2040 2041 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 2042 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 2043 // dangling. 2044 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 2045 SDValue Ret = DAG.getNode( 2046 NVPTXISD::ProxyReg, dl, 2047 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 2048 { Chain, ProxyRegOps[i], InGlue } 2049 ); 2050 2051 Chain = Ret.getValue(1); 2052 InGlue = Ret.getValue(2); 2053 2054 if (ProxyRegTruncates[i]) { 2055 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret); 2056 } 2057 2058 InVals.push_back(Ret); 2059 } 2060 2061 // set isTailCall to false for now, until we figure out how to express 2062 // tail call optimization in PTX 2063 isTailCall = false; 2064 return Chain; 2065 } 2066 2067 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 2068 // (see LegalizeDAG.cpp). This is slow and uses local memory. 2069 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 2070 SDValue 2071 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 2072 SDNode *Node = Op.getNode(); 2073 SDLoc dl(Node); 2074 SmallVector<SDValue, 8> Ops; 2075 unsigned NumOperands = Node->getNumOperands(); 2076 for (unsigned i = 0; i < NumOperands; ++i) { 2077 SDValue SubOp = Node->getOperand(i); 2078 EVT VVT = SubOp.getNode()->getValueType(0); 2079 EVT EltVT = VVT.getVectorElementType(); 2080 unsigned NumSubElem = VVT.getVectorNumElements(); 2081 for (unsigned j = 0; j < NumSubElem; ++j) { 2082 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 2083 DAG.getIntPtrConstant(j, dl))); 2084 } 2085 } 2086 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 2087 } 2088 2089 // We can init constant f16x2 with a single .b32 move. Normally it 2090 // would get lowered as two constant loads and vector-packing move. 
2091 // mov.b16 %h1, 0x4000; 2092 // mov.b16 %h2, 0x3C00; 2093 // mov.b32 %hh2, {%h2, %h1}; 2094 // Instead we want just a constant move: 2095 // mov.b32 %hh2, 0x40003C00 2096 // 2097 // This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0 2098 // generates good SASS in both cases. 2099 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 2100 SelectionDAG &DAG) const { 2101 if (!(Isv2f16Orv2bf16Type(Op->getValueType(0)) && 2102 isa<ConstantFPSDNode>(Op->getOperand(0)) && 2103 isa<ConstantFPSDNode>(Op->getOperand(1)))) 2104 return Op; 2105 2106 APInt E0 = 2107 cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt(); 2108 APInt E1 = 2109 cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt(); 2110 SDValue Const = 2111 DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32); 2112 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const); 2113 } 2114 2115 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 2116 SelectionDAG &DAG) const { 2117 SDValue Index = Op->getOperand(1); 2118 // Constant index will be matched by tablegen. 2119 if (isa<ConstantSDNode>(Index.getNode())) 2120 return Op; 2121 2122 // Extract individual elements and select one of them. 2123 SDValue Vector = Op->getOperand(0); 2124 EVT VectorVT = Vector.getValueType(); 2125 assert(VectorVT == MVT::v2f16 && "Unexpected vector type."); 2126 EVT EltVT = VectorVT.getVectorElementType(); 2127 2128 SDLoc dl(Op.getNode()); 2129 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2130 DAG.getIntPtrConstant(0, dl)); 2131 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2132 DAG.getIntPtrConstant(1, dl)); 2133 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 2134 ISD::CondCode::SETEQ); 2135 } 2136 2137 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 2138 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2139 /// amount, or 2140 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2141 /// amount. 2142 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 2143 SelectionDAG &DAG) const { 2144 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2145 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 2146 2147 EVT VT = Op.getValueType(); 2148 unsigned VTBits = VT.getSizeInBits(); 2149 SDLoc dl(Op); 2150 SDValue ShOpLo = Op.getOperand(0); 2151 SDValue ShOpHi = Op.getOperand(1); 2152 SDValue ShAmt = Op.getOperand(2); 2153 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 2154 2155 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2156 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2157 // {dHi, dLo} = {aHi, aLo} >> Amt 2158 // dHi = aHi >> Amt 2159 // dLo = shf.r.clamp aLo, aHi, Amt 2160 2161 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2162 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 2163 ShAmt); 2164 2165 SDValue Ops[2] = { Lo, Hi }; 2166 return DAG.getMergeValues(Ops, dl); 2167 } 2168 else { 2169 // {dHi, dLo} = {aHi, aLo} >> Amt 2170 // - if (Amt>=size) then 2171 // dLo = aHi >> (Amt-size) 2172 // dHi = aHi >> Amt (this is either all 0 or all 1) 2173 // else 2174 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 2175 // dHi = aHi >> Amt 2176 2177 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2178 DAG.getConstant(VTBits, dl, MVT::i32), 2179 ShAmt); 2180 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 2181 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2182 DAG.getConstant(VTBits, dl, MVT::i32)); 2183 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 2184 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2185 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2186 2187 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2188 DAG.getConstant(VTBits, dl, MVT::i32), 2189 ISD::SETGE); 2190 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2191 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2192 2193 SDValue Ops[2] = { Lo, Hi }; 2194 return DAG.getMergeValues(Ops, dl); 2195 } 2196 } 2197 2198 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2199 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2200 /// amount, or 2201 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2202 /// amount. 2203 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2204 SelectionDAG &DAG) const { 2205 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2206 assert(Op.getOpcode() == ISD::SHL_PARTS); 2207 2208 EVT VT = Op.getValueType(); 2209 unsigned VTBits = VT.getSizeInBits(); 2210 SDLoc dl(Op); 2211 SDValue ShOpLo = Op.getOperand(0); 2212 SDValue ShOpHi = Op.getOperand(1); 2213 SDValue ShAmt = Op.getOperand(2); 2214 2215 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2216 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2217     // {dHi, dLo} = {aHi, aLo} << Amt
2218     // dHi = shf.l.clamp aLo, aHi, Amt
2219     // dLo = aLo << Amt
2220
2221     SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2222                              ShAmt);
2223     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2224
2225     SDValue Ops[2] = { Lo, Hi };
2226     return DAG.getMergeValues(Ops, dl);
2227   }
2228   else {
2229     // {dHi, dLo} = {aHi, aLo} << Amt
2230     // - if (Amt>=size) then
2231     //    dLo = aLo << Amt (all 0)
2232     //    dHi = aLo << (Amt-size)
2233     //   else
2234     //    dLo = aLo << Amt
2235     //    dHi = (aHi << Amt) | (aLo >> (size-Amt))
2236
2237     SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2238                                    DAG.getConstant(VTBits, dl, MVT::i32),
2239                                    ShAmt);
2240     SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2241     SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2242                                      DAG.getConstant(VTBits, dl, MVT::i32));
2243     SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2244     SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2245     SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2246
2247     SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2248                                DAG.getConstant(VTBits, dl, MVT::i32),
2249                                ISD::SETGE);
2250     SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2251     SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2252
2253     SDValue Ops[2] = { Lo, Hi };
2254     return DAG.getMergeValues(Ops, dl);
2255   }
2256 }
2257
2258 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2259   EVT VT = Op.getValueType();
2260
2261   if (VT == MVT::f32)
2262     return LowerFROUND32(Op, DAG);
2263
2264   if (VT == MVT::f64)
2265     return LowerFROUND64(Op, DAG);
2266
2267   llvm_unreachable("unhandled type");
2268 }
2269
2270 // This is the rounding method used in CUDA libdevice, in C-like code:
2271 // float roundf(float A)
2272 // {
2273 //   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2274 //   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2275 //   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2276 // }
2277 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2278                                            SelectionDAG &DAG) const {
2279   SDLoc SL(Op);
2280   SDValue A = Op.getOperand(0);
2281   EVT VT = Op.getValueType();
2282
2283   SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2284
2285   // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2286   SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2287   const int SignBitMask = 0x80000000;
2288   SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2289                              DAG.getConstant(SignBitMask, SL, MVT::i32));
2290   const int PointFiveInBits = 0x3F000000;
2291   SDValue PointFiveWithSignRaw =
2292       DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2293                   DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2294   SDValue PointFiveWithSign =
2295       DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2296   SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2297   SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2298
2299   // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2300   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2301   SDValue IsLarge =
2302       DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2303                    ISD::SETOGT);
2304   RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2305
2306   // return abs(A) < 0.5 ?
(float)(int)A : RoundedA; 2307 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2308 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2309 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2310 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2311 } 2312 2313 // The implementation of round(double) is similar to that of round(float) in 2314 // that they both separate the value range into three regions and use a method 2315 // specific to the region to round the values. However, round(double) first 2316 // calculates the round of the absolute value and then adds the sign back while 2317 // round(float) directly rounds the value with sign. 2318 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2319 SelectionDAG &DAG) const { 2320 SDLoc SL(Op); 2321 SDValue A = Op.getOperand(0); 2322 EVT VT = Op.getValueType(); 2323 2324 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2325 2326 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2327 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2328 DAG.getConstantFP(0.5, SL, VT)); 2329 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2330 2331 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2332 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2333 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2334 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2335 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2336 DAG.getConstantFP(0, SL, VT), 2337 RoundedA); 2338 2339 // Add sign to rounded_A 2340 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2341 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2342 2343 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2344 SDValue IsLarge = 2345 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2346 ISD::SETOGT); 2347 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2348 } 2349 2350 2351 2352 SDValue 2353 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2354 switch (Op.getOpcode()) { 2355 case ISD::RETURNADDR: 2356 return SDValue(); 2357 case ISD::FRAMEADDR: 2358 return SDValue(); 2359 case ISD::GlobalAddress: 2360 return LowerGlobalAddress(Op, DAG); 2361 case ISD::INTRINSIC_W_CHAIN: 2362 return Op; 2363 case ISD::BUILD_VECTOR: 2364 return LowerBUILD_VECTOR(Op, DAG); 2365 case ISD::EXTRACT_SUBVECTOR: 2366 return Op; 2367 case ISD::EXTRACT_VECTOR_ELT: 2368 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2369 case ISD::CONCAT_VECTORS: 2370 return LowerCONCAT_VECTORS(Op, DAG); 2371 case ISD::STORE: 2372 return LowerSTORE(Op, DAG); 2373 case ISD::LOAD: 2374 return LowerLOAD(Op, DAG); 2375 case ISD::SHL_PARTS: 2376 return LowerShiftLeftParts(Op, DAG); 2377 case ISD::SRA_PARTS: 2378 case ISD::SRL_PARTS: 2379 return LowerShiftRightParts(Op, DAG); 2380 case ISD::SELECT: 2381 return LowerSelect(Op, DAG); 2382 case ISD::FROUND: 2383 return LowerFROUND(Op, DAG); 2384 case ISD::VAARG: 2385 return LowerVAARG(Op, DAG); 2386 case ISD::VASTART: 2387 return LowerVASTART(Op, DAG); 2388 default: 2389 llvm_unreachable("Custom lowering not defined for operation"); 2390 } 2391 } 2392 2393 // This function is almost a copy of SelectionDAG::expandVAArg(). 2394 // The only diff is that this one produces loads from local address space. 
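// Roughly, for "t = va_arg(ap, T)" this emits (pseudo-code sketch, not the
// exact SDNodes):
//   cur = load(ap)                      // current vararg pointer
//   cur = align cur up to the requested alignment, if it is overaligned
//   store(ap, cur + sizeof(T))          // advance the pointer
//   t   = load T from cur               // load from the local address space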
2395 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 2396 const TargetLowering *TLI = STI.getTargetLowering(); 2397 SDLoc DL(Op); 2398 2399 SDNode *Node = Op.getNode(); 2400 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); 2401 EVT VT = Node->getValueType(0); 2402 auto *Ty = VT.getTypeForEVT(*DAG.getContext()); 2403 SDValue Tmp1 = Node->getOperand(0); 2404 SDValue Tmp2 = Node->getOperand(1); 2405 const MaybeAlign MA(Node->getConstantOperandVal(3)); 2406 2407 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL, 2408 Tmp1, Tmp2, MachinePointerInfo(V)); 2409 SDValue VAList = VAListLoad; 2410 2411 if (MA && *MA > TLI->getMinStackArgumentAlignment()) { 2412 VAList = DAG.getNode( 2413 ISD::ADD, DL, VAList.getValueType(), VAList, 2414 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType())); 2415 2416 VAList = DAG.getNode( 2417 ISD::AND, DL, VAList.getValueType(), VAList, 2418 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType())); 2419 } 2420 2421 // Increment the pointer, VAList, to the next vaarg 2422 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, 2423 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty), 2424 DL, VAList.getValueType())); 2425 2426 // Store the incremented VAList to the legalized pointer 2427 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2, 2428 MachinePointerInfo(V)); 2429 2430 const Value *SrcV = 2431 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL)); 2432 2433 // Load the actual argument out of the pointer VAList 2434 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV)); 2435 } 2436 2437 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2438 const TargetLowering *TLI = STI.getTargetLowering(); 2439 SDLoc DL(Op); 2440 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout()); 2441 2442 // Store the address of unsized array <function>_vararg[] in the ap object. 2443 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT); 2444 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg); 2445 2446 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2447 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1), 2448 MachinePointerInfo(SV)); 2449 } 2450 2451 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2452 SDValue Op0 = Op->getOperand(0); 2453 SDValue Op1 = Op->getOperand(1); 2454 SDValue Op2 = Op->getOperand(2); 2455 SDLoc DL(Op.getNode()); 2456 2457 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2458 2459 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2460 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2461 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 2462 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2463 2464 return Trunc; 2465 } 2466 2467 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2468 if (Op.getValueType() == MVT::i1) 2469 return LowerLOADi1(Op, DAG); 2470 2471 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2472 // loads and have to handle it here. 
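  // (expandUnalignedLoad, used below, expands the access into smaller,
  // sufficiently aligned memory operations and reassembles the value, which
  // is roughly what the generic legalizer would do for a non-legal type.)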
2473 if (Isv2f16Orv2bf16Type(Op.getValueType())) { 2474 LoadSDNode *Load = cast<LoadSDNode>(Op); 2475 EVT MemVT = Load->getMemoryVT(); 2476 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2477 MemVT, *Load->getMemOperand())) { 2478 SDValue Ops[2]; 2479 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2480 return DAG.getMergeValues(Ops, SDLoc(Op)); 2481 } 2482 } 2483 2484 return SDValue(); 2485 } 2486 2487 // v = ld i1* addr 2488 // => 2489 // v1 = ld i8* addr (-> i16) 2490 // v = trunc i16 to i1 2491 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2492 SDNode *Node = Op.getNode(); 2493 LoadSDNode *LD = cast<LoadSDNode>(Node); 2494 SDLoc dl(Node); 2495 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2496 assert(Node->getValueType(0) == MVT::i1 && 2497 "Custom lowering for i1 load only"); 2498 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), 2499 LD->getPointerInfo(), LD->getAlign(), 2500 LD->getMemOperand()->getFlags()); 2501 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2502 // The legalizer (the caller) is expecting two values from the legalized 2503 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2504 // in LegalizeDAG.cpp which also uses MergeValues. 2505 SDValue Ops[] = { result, LD->getChain() }; 2506 return DAG.getMergeValues(Ops, dl); 2507 } 2508 2509 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2510 StoreSDNode *Store = cast<StoreSDNode>(Op); 2511 EVT VT = Store->getMemoryVT(); 2512 2513 if (VT == MVT::i1) 2514 return LowerSTOREi1(Op, DAG); 2515 2516 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2517 // stores and have to handle it here. 2518 if (Isv2f16Orv2bf16Type(VT) && 2519 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2520 VT, *Store->getMemOperand())) 2521 return expandUnalignedStore(Store, DAG); 2522 2523 // v2f16 and v2bf16 don't need special handling. 2524 if (VT == MVT::v2f16 || VT == MVT::v2bf16) 2525 return SDValue(); 2526 2527 if (VT.isVector()) 2528 return LowerSTOREVector(Op, DAG); 2529 2530 return SDValue(); 2531 } 2532 2533 SDValue 2534 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2535 SDNode *N = Op.getNode(); 2536 SDValue Val = N->getOperand(1); 2537 SDLoc DL(N); 2538 EVT ValVT = Val.getValueType(); 2539 2540 if (ValVT.isVector()) { 2541 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2542 // legal. We can (and should) split that into 2 stores of <2 x double> here 2543 // but I'm leaving that as a TODO for now. 
2544 if (!ValVT.isSimple()) 2545 return SDValue(); 2546 switch (ValVT.getSimpleVT().SimpleTy) { 2547 default: 2548 return SDValue(); 2549 case MVT::v2i8: 2550 case MVT::v2i16: 2551 case MVT::v2i32: 2552 case MVT::v2i64: 2553 case MVT::v2f16: 2554 case MVT::v2bf16: 2555 case MVT::v2f32: 2556 case MVT::v2f64: 2557 case MVT::v4i8: 2558 case MVT::v4i16: 2559 case MVT::v4i32: 2560 case MVT::v4f16: 2561 case MVT::v4bf16: 2562 case MVT::v4f32: 2563 case MVT::v8f16: // <4 x f16x2> 2564 case MVT::v8bf16: // <4 x bf16x2> 2565 // This is a "native" vector type 2566 break; 2567 } 2568 2569 MemSDNode *MemSD = cast<MemSDNode>(N); 2570 const DataLayout &TD = DAG.getDataLayout(); 2571 2572 Align Alignment = MemSD->getAlign(); 2573 Align PrefAlign = 2574 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); 2575 if (Alignment < PrefAlign) { 2576 // This store is not sufficiently aligned, so bail out and let this vector 2577 // store be scalarized. Note that we may still be able to emit smaller 2578 // vector stores. For example, if we are storing a <4 x float> with an 2579 // alignment of 8, this check will fail but the legalizer will try again 2580 // with 2 x <2 x float>, which will succeed with an alignment of 8. 2581 return SDValue(); 2582 } 2583 2584 unsigned Opcode = 0; 2585 EVT EltVT = ValVT.getVectorElementType(); 2586 unsigned NumElts = ValVT.getVectorNumElements(); 2587 2588 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2589 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 2590 // stored type to i16 and propagate the "real" type as the memory type. 2591 bool NeedExt = false; 2592 if (EltVT.getSizeInBits() < 16) 2593 NeedExt = true; 2594 2595 bool StoreF16x2 = false; 2596 switch (NumElts) { 2597 default: 2598 return SDValue(); 2599 case 2: 2600 Opcode = NVPTXISD::StoreV2; 2601 break; 2602 case 4: 2603 Opcode = NVPTXISD::StoreV4; 2604 break; 2605 case 8: 2606 // v8f16 is a special case. PTX doesn't have st.v8.f16 2607 // instruction. Instead, we split the vector into v2f16 chunks and 2608 // store them with st.v4.b32. 
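      // For example, a v8f16 store then becomes roughly
      //   st.v4.b32 [addr], {%r0, %r1, %r2, %r3};
      // where each 32-bit register holds one of the f16x2 pairs built below
      // (register names are illustrative only).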
2609 assert(Isf16Orbf16Type(EltVT.getSimpleVT()) && 2610 "Wrong type for the vector."); 2611 Opcode = NVPTXISD::StoreV4; 2612 StoreF16x2 = true; 2613 break; 2614 } 2615 2616 SmallVector<SDValue, 8> Ops; 2617 2618 // First is the chain 2619 Ops.push_back(N->getOperand(0)); 2620 2621 if (StoreF16x2) { 2622 // Combine f16,f16 -> v2f16 2623 NumElts /= 2; 2624 for (unsigned i = 0; i < NumElts; ++i) { 2625 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2626 DAG.getIntPtrConstant(i * 2, DL)); 2627 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2628 DAG.getIntPtrConstant(i * 2 + 1, DL)); 2629 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2); 2630 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1); 2631 Ops.push_back(V2); 2632 } 2633 } else { 2634 // Then the split values 2635 for (unsigned i = 0; i < NumElts; ++i) { 2636 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 2637 DAG.getIntPtrConstant(i, DL)); 2638 if (NeedExt) 2639 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 2640 Ops.push_back(ExtVal); 2641 } 2642 } 2643 2644 // Then any remaining arguments 2645 Ops.append(N->op_begin() + 2, N->op_end()); 2646 2647 SDValue NewSt = 2648 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 2649 MemSD->getMemoryVT(), MemSD->getMemOperand()); 2650 2651 // return DCI.CombineTo(N, NewSt, true); 2652 return NewSt; 2653 } 2654 2655 return SDValue(); 2656 } 2657 2658 // st i1 v, addr 2659 // => 2660 // v1 = zxt v to i16 2661 // st.u8 i16, addr 2662 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 2663 SDNode *Node = Op.getNode(); 2664 SDLoc dl(Node); 2665 StoreSDNode *ST = cast<StoreSDNode>(Node); 2666 SDValue Tmp1 = ST->getChain(); 2667 SDValue Tmp2 = ST->getBasePtr(); 2668 SDValue Tmp3 = ST->getValue(); 2669 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 2670 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 2671 SDValue Result = 2672 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 2673 ST->getAlign(), ST->getMemOperand()->getFlags()); 2674 return Result; 2675 } 2676 2677 // This creates target external symbol for a function parameter. 2678 // Name of the symbol is composed from its index and the function name. 2679 // Negative index corresponds to special parameter (unsized array) used for 2680 // passing variable arguments. 
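// For example, assuming the naming scheme implemented by getParamName, the
// second parameter of a function "foo" is referred to via the symbol
// "foo_param_1", while the negative vararg index yields the "foo_vararg"
// array symbol mentioned in LowerVASTART.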
2681 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, 2682 EVT v) const { 2683 StringRef SavedStr = nvTM->getStrPool().save( 2684 getParamName(&DAG.getMachineFunction().getFunction(), idx)); 2685 return DAG.getTargetExternalSymbol(SavedStr.data(), v); 2686 } 2687 2688 SDValue NVPTXTargetLowering::LowerFormalArguments( 2689 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2690 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2691 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 2692 MachineFunction &MF = DAG.getMachineFunction(); 2693 const DataLayout &DL = DAG.getDataLayout(); 2694 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2695 2696 const Function *F = &MF.getFunction(); 2697 const AttributeList &PAL = F->getAttributes(); 2698 const TargetLowering *TLI = STI.getTargetLowering(); 2699 2700 SDValue Root = DAG.getRoot(); 2701 std::vector<SDValue> OutChains; 2702 2703 bool isABI = (STI.getSmVersion() >= 20); 2704 assert(isABI && "Non-ABI compilation is not supported"); 2705 if (!isABI) 2706 return Chain; 2707 2708 std::vector<Type *> argTypes; 2709 std::vector<const Argument *> theArgs; 2710 for (const Argument &I : F->args()) { 2711 theArgs.push_back(&I); 2712 argTypes.push_back(I.getType()); 2713 } 2714 // argTypes.size() (or theArgs.size()) and Ins.size() need not match. 2715 // Ins.size() will be larger 2716 // * if there is an aggregate argument with multiple fields (each field 2717 // showing up separately in Ins) 2718 // * if there is a vector argument with more than typical vector-length 2719 // elements (generally if more than 4) where each vector element is 2720 // individually present in Ins. 2721 // So a different index should be used for indexing into Ins. 2722 // See similar issue in LowerCall. 2723 unsigned InsIdx = 0; 2724 2725 int idx = 0; 2726 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) { 2727 Type *Ty = argTypes[i]; 2728 2729 if (theArgs[i]->use_empty()) { 2730 // argument is dead 2731 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) { 2732 SmallVector<EVT, 16> vtparts; 2733 2734 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); 2735 if (vtparts.empty()) 2736 report_fatal_error("Empty parameter types are not supported"); 2737 2738 for (unsigned parti = 0, parte = vtparts.size(); parti != parte; 2739 ++parti) { 2740 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2741 ++InsIdx; 2742 } 2743 if (vtparts.size() > 0) 2744 --InsIdx; 2745 continue; 2746 } 2747 if (Ty->isVectorTy()) { 2748 EVT ObjectVT = getValueType(DL, Ty); 2749 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); 2750 for (unsigned parti = 0; parti < NumRegs; ++parti) { 2751 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2752 ++InsIdx; 2753 } 2754 if (NumRegs > 0) 2755 --InsIdx; 2756 continue; 2757 } 2758 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 2759 continue; 2760 } 2761 2762 // In the following cases, assign a node order of "idx+1" 2763 // to newly created nodes. The SDNodes for params have to 2764 // appear in the same order as their order of appearance 2765 // in the original function. "idx+1" holds that order. 
2766 if (!PAL.hasParamAttr(i, Attribute::ByVal)) { 2767 bool aggregateIsPacked = false; 2768 if (StructType *STy = dyn_cast<StructType>(Ty)) 2769 aggregateIsPacked = STy->isPacked(); 2770 2771 SmallVector<EVT, 16> VTs; 2772 SmallVector<uint64_t, 16> Offsets; 2773 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 2774 if (VTs.empty()) 2775 report_fatal_error("Empty parameter types are not supported"); 2776 2777 auto VectorInfo = 2778 VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty)); 2779 2780 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2781 int VecIdx = -1; // Index of the first element of the current vector. 2782 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 2783 if (VectorInfo[parti] & PVF_FIRST) { 2784 assert(VecIdx == -1 && "Orphaned vector."); 2785 VecIdx = parti; 2786 } 2787 2788 // That's the last element of this store op. 2789 if (VectorInfo[parti] & PVF_LAST) { 2790 unsigned NumElts = parti - VecIdx + 1; 2791 EVT EltVT = VTs[parti]; 2792 // i1 is loaded/stored as i8. 2793 EVT LoadVT = EltVT; 2794 if (EltVT == MVT::i1) 2795 LoadVT = MVT::i8; 2796 else if (Isv2f16Orv2bf16Type(EltVT)) 2797 // getLoad needs a vector type, but it can't handle 2798 // vectors which contain v2f16 or v2bf16 elements. So we must load 2799 // using i32 here and then bitcast back. 2800 LoadVT = MVT::i32; 2801 2802 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 2803 SDValue VecAddr = 2804 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 2805 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 2806 Value *srcValue = Constant::getNullValue(PointerType::get( 2807 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 2808 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, 2809 MachinePointerInfo(srcValue), 2810 MaybeAlign(aggregateIsPacked ? 1 : 0), 2811 MachineMemOperand::MODereferenceable | 2812 MachineMemOperand::MOInvariant); 2813 if (P.getNode()) 2814 P.getNode()->setIROrder(idx + 1); 2815 for (unsigned j = 0; j < NumElts; ++j) { 2816 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 2817 DAG.getIntPtrConstant(j, dl)); 2818 // We've loaded i1 as an i8 and now must truncate it back to i1 2819 if (EltVT == MVT::i1) 2820 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 2821 // v2f16 was loaded as an i32. Now we must bitcast it back. 2822 else if (Isv2f16Orv2bf16Type(EltVT)) 2823 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt); 2824 2825 // If a promoted integer type is used, truncate down to the original 2826 MVT PromotedVT; 2827 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 2828 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 2829 } 2830 2831 // Extend the element if necessary (e.g. an i8 is loaded 2832 // into an i16 register) 2833 if (Ins[InsIdx].VT.isInteger() && 2834 Ins[InsIdx].VT.getFixedSizeInBits() > 2835 LoadVT.getFixedSizeInBits()) { 2836 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 2837 : ISD::ZERO_EXTEND; 2838 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 2839 } 2840 InVals.push_back(Elt); 2841 } 2842 2843 // Reset vector tracking state. 2844 VecIdx = -1; 2845 } 2846 ++InsIdx; 2847 } 2848 if (VTs.size() > 0) 2849 --InsIdx; 2850 continue; 2851 } 2852 2853 // Param has ByVal attribute 2854 // Return MoveParam(param symbol). 
2855 // Ideally, the param symbol can be returned directly, 2856 // but when SDNode builder decides to use it in a CopyToReg(), 2857 // machine instruction fails because TargetExternalSymbol 2858 // (not lowered) is target dependent, and CopyToReg assumes 2859 // the source is lowered. 2860 EVT ObjectVT = getValueType(DL, Ty); 2861 assert(ObjectVT == Ins[InsIdx].VT && 2862 "Ins type did not match function type"); 2863 SDValue Arg = getParamSymbol(DAG, idx, PtrVT); 2864 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 2865 if (p.getNode()) 2866 p.getNode()->setIROrder(idx + 1); 2867 InVals.push_back(p); 2868 } 2869 2870 if (!OutChains.empty()) 2871 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 2872 2873 return Chain; 2874 } 2875 2876 SDValue 2877 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2878 bool isVarArg, 2879 const SmallVectorImpl<ISD::OutputArg> &Outs, 2880 const SmallVectorImpl<SDValue> &OutVals, 2881 const SDLoc &dl, SelectionDAG &DAG) const { 2882 const MachineFunction &MF = DAG.getMachineFunction(); 2883 const Function &F = MF.getFunction(); 2884 Type *RetTy = MF.getFunction().getReturnType(); 2885 2886 bool isABI = (STI.getSmVersion() >= 20); 2887 assert(isABI && "Non-ABI compilation is not supported"); 2888 if (!isABI) 2889 return Chain; 2890 2891 const DataLayout &DL = DAG.getDataLayout(); 2892 SmallVector<SDValue, 16> PromotedOutVals; 2893 SmallVector<EVT, 16> VTs; 2894 SmallVector<uint64_t, 16> Offsets; 2895 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 2896 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 2897 2898 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2899 SDValue PromotedOutVal = OutVals[i]; 2900 MVT PromotedVT; 2901 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) { 2902 VTs[i] = EVT(PromotedVT); 2903 } 2904 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) { 2905 llvm::ISD::NodeType Ext = 2906 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 2907 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal); 2908 } 2909 PromotedOutVals.push_back(PromotedOutVal); 2910 } 2911 2912 auto VectorInfo = VectorizePTXValueVTs( 2913 VTs, Offsets, 2914 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) 2915 : Align(1)); 2916 2917 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2918 // 32-bits are sign extended or zero extended, depending on whether 2919 // they are signed or unsigned types. 2920 bool ExtendIntegerRetVal = 2921 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2922 2923 SmallVector<SDValue, 6> StoreOperands; 2924 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2925 // New load/store. Record chain and offset operands. 2926 if (VectorInfo[i] & PVF_FIRST) { 2927 assert(StoreOperands.empty() && "Orphaned operand list."); 2928 StoreOperands.push_back(Chain); 2929 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 2930 } 2931 2932 SDValue OutVal = OutVals[i]; 2933 SDValue RetVal = PromotedOutVals[i]; 2934 2935 if (ExtendIntegerRetVal) { 2936 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 2937 : ISD::ZERO_EXTEND, 2938 dl, MVT::i32, RetVal); 2939 } else if (OutVal.getValueSizeInBits() < 16) { 2940 // Use 16-bit registers for small load-stores as it's the 2941 // smallest general purpose register size supported by NVPTX. 2942 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 2943 } 2944 2945 // Record the value to return. 
2946 StoreOperands.push_back(RetVal); 2947 2948 // That's the last element of this store op. 2949 if (VectorInfo[i] & PVF_LAST) { 2950 NVPTXISD::NodeType Op; 2951 unsigned NumElts = StoreOperands.size() - 2; 2952 switch (NumElts) { 2953 case 1: 2954 Op = NVPTXISD::StoreRetval; 2955 break; 2956 case 2: 2957 Op = NVPTXISD::StoreRetvalV2; 2958 break; 2959 case 4: 2960 Op = NVPTXISD::StoreRetvalV4; 2961 break; 2962 default: 2963 llvm_unreachable("Invalid vector info."); 2964 } 2965 2966 // Adjust type of load/store op if we've extended the scalar 2967 // return value. 2968 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 2969 Chain = DAG.getMemIntrinsicNode( 2970 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, 2971 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 2972 // Cleanup vector state. 2973 StoreOperands.clear(); 2974 } 2975 } 2976 2977 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain); 2978 } 2979 2980 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 2981 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 2982 SelectionDAG &DAG) const { 2983 if (Constraint.length() > 1) 2984 return; 2985 else 2986 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 2987 } 2988 2989 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 2990 switch (Intrinsic) { 2991 default: 2992 return 0; 2993 2994 case Intrinsic::nvvm_tex_1d_v4f32_s32: 2995 return NVPTXISD::Tex1DFloatS32; 2996 case Intrinsic::nvvm_tex_1d_v4f32_f32: 2997 return NVPTXISD::Tex1DFloatFloat; 2998 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 2999 return NVPTXISD::Tex1DFloatFloatLevel; 3000 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3001 return NVPTXISD::Tex1DFloatFloatGrad; 3002 case Intrinsic::nvvm_tex_1d_v4s32_s32: 3003 return NVPTXISD::Tex1DS32S32; 3004 case Intrinsic::nvvm_tex_1d_v4s32_f32: 3005 return NVPTXISD::Tex1DS32Float; 3006 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 3007 return NVPTXISD::Tex1DS32FloatLevel; 3008 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 3009 return NVPTXISD::Tex1DS32FloatGrad; 3010 case Intrinsic::nvvm_tex_1d_v4u32_s32: 3011 return NVPTXISD::Tex1DU32S32; 3012 case Intrinsic::nvvm_tex_1d_v4u32_f32: 3013 return NVPTXISD::Tex1DU32Float; 3014 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 3015 return NVPTXISD::Tex1DU32FloatLevel; 3016 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 3017 return NVPTXISD::Tex1DU32FloatGrad; 3018 3019 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3020 return NVPTXISD::Tex1DArrayFloatS32; 3021 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3022 return NVPTXISD::Tex1DArrayFloatFloat; 3023 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3024 return NVPTXISD::Tex1DArrayFloatFloatLevel; 3025 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3026 return NVPTXISD::Tex1DArrayFloatFloatGrad; 3027 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 3028 return NVPTXISD::Tex1DArrayS32S32; 3029 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 3030 return NVPTXISD::Tex1DArrayS32Float; 3031 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 3032 return NVPTXISD::Tex1DArrayS32FloatLevel; 3033 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 3034 return NVPTXISD::Tex1DArrayS32FloatGrad; 3035 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 3036 return NVPTXISD::Tex1DArrayU32S32; 3037 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 3038 return NVPTXISD::Tex1DArrayU32Float; 3039 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 3040 return NVPTXISD::Tex1DArrayU32FloatLevel; 3041 case 
Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 3042 return NVPTXISD::Tex1DArrayU32FloatGrad; 3043 3044 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3045 return NVPTXISD::Tex2DFloatS32; 3046 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3047 return NVPTXISD::Tex2DFloatFloat; 3048 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3049 return NVPTXISD::Tex2DFloatFloatLevel; 3050 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3051 return NVPTXISD::Tex2DFloatFloatGrad; 3052 case Intrinsic::nvvm_tex_2d_v4s32_s32: 3053 return NVPTXISD::Tex2DS32S32; 3054 case Intrinsic::nvvm_tex_2d_v4s32_f32: 3055 return NVPTXISD::Tex2DS32Float; 3056 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 3057 return NVPTXISD::Tex2DS32FloatLevel; 3058 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 3059 return NVPTXISD::Tex2DS32FloatGrad; 3060 case Intrinsic::nvvm_tex_2d_v4u32_s32: 3061 return NVPTXISD::Tex2DU32S32; 3062 case Intrinsic::nvvm_tex_2d_v4u32_f32: 3063 return NVPTXISD::Tex2DU32Float; 3064 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 3065 return NVPTXISD::Tex2DU32FloatLevel; 3066 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 3067 return NVPTXISD::Tex2DU32FloatGrad; 3068 3069 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3070 return NVPTXISD::Tex2DArrayFloatS32; 3071 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3072 return NVPTXISD::Tex2DArrayFloatFloat; 3073 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3074 return NVPTXISD::Tex2DArrayFloatFloatLevel; 3075 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3076 return NVPTXISD::Tex2DArrayFloatFloatGrad; 3077 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3078 return NVPTXISD::Tex2DArrayS32S32; 3079 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3080 return NVPTXISD::Tex2DArrayS32Float; 3081 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3082 return NVPTXISD::Tex2DArrayS32FloatLevel; 3083 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3084 return NVPTXISD::Tex2DArrayS32FloatGrad; 3085 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 3086 return NVPTXISD::Tex2DArrayU32S32; 3087 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 3088 return NVPTXISD::Tex2DArrayU32Float; 3089 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 3090 return NVPTXISD::Tex2DArrayU32FloatLevel; 3091 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 3092 return NVPTXISD::Tex2DArrayU32FloatGrad; 3093 3094 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3095 return NVPTXISD::Tex3DFloatS32; 3096 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3097 return NVPTXISD::Tex3DFloatFloat; 3098 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3099 return NVPTXISD::Tex3DFloatFloatLevel; 3100 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3101 return NVPTXISD::Tex3DFloatFloatGrad; 3102 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3103 return NVPTXISD::Tex3DS32S32; 3104 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3105 return NVPTXISD::Tex3DS32Float; 3106 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3107 return NVPTXISD::Tex3DS32FloatLevel; 3108 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3109 return NVPTXISD::Tex3DS32FloatGrad; 3110 case Intrinsic::nvvm_tex_3d_v4u32_s32: 3111 return NVPTXISD::Tex3DU32S32; 3112 case Intrinsic::nvvm_tex_3d_v4u32_f32: 3113 return NVPTXISD::Tex3DU32Float; 3114 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 3115 return NVPTXISD::Tex3DU32FloatLevel; 3116 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 3117 return NVPTXISD::Tex3DU32FloatGrad; 3118 3119 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3120 return NVPTXISD::TexCubeFloatFloat; 3121 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3122 return NVPTXISD::TexCubeFloatFloatLevel; 3123 case 
Intrinsic::nvvm_tex_cube_v4s32_f32: 3124 return NVPTXISD::TexCubeS32Float; 3125 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3126 return NVPTXISD::TexCubeS32FloatLevel; 3127 case Intrinsic::nvvm_tex_cube_v4u32_f32: 3128 return NVPTXISD::TexCubeU32Float; 3129 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 3130 return NVPTXISD::TexCubeU32FloatLevel; 3131 3132 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3133 return NVPTXISD::TexCubeArrayFloatFloat; 3134 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3135 return NVPTXISD::TexCubeArrayFloatFloatLevel; 3136 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3137 return NVPTXISD::TexCubeArrayS32Float; 3138 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 3139 return NVPTXISD::TexCubeArrayS32FloatLevel; 3140 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 3141 return NVPTXISD::TexCubeArrayU32Float; 3142 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 3143 return NVPTXISD::TexCubeArrayU32FloatLevel; 3144 3145 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3146 return NVPTXISD::Tld4R2DFloatFloat; 3147 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3148 return NVPTXISD::Tld4G2DFloatFloat; 3149 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3150 return NVPTXISD::Tld4B2DFloatFloat; 3151 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3152 return NVPTXISD::Tld4A2DFloatFloat; 3153 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 3154 return NVPTXISD::Tld4R2DS64Float; 3155 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 3156 return NVPTXISD::Tld4G2DS64Float; 3157 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 3158 return NVPTXISD::Tld4B2DS64Float; 3159 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 3160 return NVPTXISD::Tld4A2DS64Float; 3161 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 3162 return NVPTXISD::Tld4R2DU64Float; 3163 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 3164 return NVPTXISD::Tld4G2DU64Float; 3165 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 3166 return NVPTXISD::Tld4B2DU64Float; 3167 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 3168 return NVPTXISD::Tld4A2DU64Float; 3169 3170 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3171 return NVPTXISD::TexUnified1DFloatS32; 3172 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3173 return NVPTXISD::TexUnified1DFloatFloat; 3174 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3175 return NVPTXISD::TexUnified1DFloatFloatLevel; 3176 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3177 return NVPTXISD::TexUnified1DFloatFloatGrad; 3178 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 3179 return NVPTXISD::TexUnified1DS32S32; 3180 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 3181 return NVPTXISD::TexUnified1DS32Float; 3182 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 3183 return NVPTXISD::TexUnified1DS32FloatLevel; 3184 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 3185 return NVPTXISD::TexUnified1DS32FloatGrad; 3186 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 3187 return NVPTXISD::TexUnified1DU32S32; 3188 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 3189 return NVPTXISD::TexUnified1DU32Float; 3190 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 3191 return NVPTXISD::TexUnified1DU32FloatLevel; 3192 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 3193 return NVPTXISD::TexUnified1DU32FloatGrad; 3194 3195 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3196 return NVPTXISD::TexUnified1DArrayFloatS32; 3197 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3198 return NVPTXISD::TexUnified1DArrayFloatFloat; 3199 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3200 return 
NVPTXISD::TexUnified1DArrayFloatFloatLevel; 3201 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3202 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 3203 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 3204 return NVPTXISD::TexUnified1DArrayS32S32; 3205 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3206 return NVPTXISD::TexUnified1DArrayS32Float; 3207 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3208 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 3209 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3210 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 3211 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3212 return NVPTXISD::TexUnified1DArrayU32S32; 3213 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3214 return NVPTXISD::TexUnified1DArrayU32Float; 3215 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3216 return NVPTXISD::TexUnified1DArrayU32FloatLevel; 3217 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3218 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 3219 3220 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3221 return NVPTXISD::TexUnified2DFloatS32; 3222 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3223 return NVPTXISD::TexUnified2DFloatFloat; 3224 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3225 return NVPTXISD::TexUnified2DFloatFloatLevel; 3226 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3227 return NVPTXISD::TexUnified2DFloatFloatGrad; 3228 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3229 return NVPTXISD::TexUnified2DS32S32; 3230 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3231 return NVPTXISD::TexUnified2DS32Float; 3232 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3233 return NVPTXISD::TexUnified2DS32FloatLevel; 3234 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3235 return NVPTXISD::TexUnified2DS32FloatGrad; 3236 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3237 return NVPTXISD::TexUnified2DU32S32; 3238 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3239 return NVPTXISD::TexUnified2DU32Float; 3240 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3241 return NVPTXISD::TexUnified2DU32FloatLevel; 3242 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3243 return NVPTXISD::TexUnified2DU32FloatGrad; 3244 3245 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3246 return NVPTXISD::TexUnified2DArrayFloatS32; 3247 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3248 return NVPTXISD::TexUnified2DArrayFloatFloat; 3249 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3250 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3251 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3252 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3253 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3254 return NVPTXISD::TexUnified2DArrayS32S32; 3255 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3256 return NVPTXISD::TexUnified2DArrayS32Float; 3257 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3258 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3259 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3260 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3261 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3262 return NVPTXISD::TexUnified2DArrayU32S32; 3263 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3264 return NVPTXISD::TexUnified2DArrayU32Float; 3265 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3266 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3267 case 
Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3268 return NVPTXISD::TexUnified2DArrayU32FloatGrad; 3269 3270 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3271 return NVPTXISD::TexUnified3DFloatS32; 3272 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3273 return NVPTXISD::TexUnified3DFloatFloat; 3274 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3275 return NVPTXISD::TexUnified3DFloatFloatLevel; 3276 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3277 return NVPTXISD::TexUnified3DFloatFloatGrad; 3278 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3279 return NVPTXISD::TexUnified3DS32S32; 3280 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3281 return NVPTXISD::TexUnified3DS32Float; 3282 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3283 return NVPTXISD::TexUnified3DS32FloatLevel; 3284 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3285 return NVPTXISD::TexUnified3DS32FloatGrad; 3286 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3287 return NVPTXISD::TexUnified3DU32S32; 3288 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3289 return NVPTXISD::TexUnified3DU32Float; 3290 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3291 return NVPTXISD::TexUnified3DU32FloatLevel; 3292 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3293 return NVPTXISD::TexUnified3DU32FloatGrad; 3294 3295 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3296 return NVPTXISD::TexUnifiedCubeFloatFloat; 3297 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3298 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3299 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3300 return NVPTXISD::TexUnifiedCubeS32Float; 3301 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3302 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3303 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3304 return NVPTXISD::TexUnifiedCubeU32Float; 3305 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3306 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3307 3308 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3309 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3310 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3311 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3312 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3313 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3314 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3315 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3316 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3317 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3318 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3319 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3320 3321 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3322 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3323 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3324 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3325 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3326 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3327 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3328 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3329 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3330 return NVPTXISD::Tld4UnifiedR2DS64Float; 3331 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3332 return NVPTXISD::Tld4UnifiedG2DS64Float; 3333 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3334 return NVPTXISD::Tld4UnifiedB2DS64Float; 3335 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3336 return NVPTXISD::Tld4UnifiedA2DS64Float; 3337 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 
3338 return NVPTXISD::Tld4UnifiedR2DU64Float; 3339 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3340 return NVPTXISD::Tld4UnifiedG2DU64Float; 3341 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3342 return NVPTXISD::Tld4UnifiedB2DU64Float; 3343 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3344 return NVPTXISD::Tld4UnifiedA2DU64Float; 3345 } 3346 } 3347 3348 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3349 switch (Intrinsic) { 3350 default: 3351 return 0; 3352 case Intrinsic::nvvm_suld_1d_i8_clamp: 3353 return NVPTXISD::Suld1DI8Clamp; 3354 case Intrinsic::nvvm_suld_1d_i16_clamp: 3355 return NVPTXISD::Suld1DI16Clamp; 3356 case Intrinsic::nvvm_suld_1d_i32_clamp: 3357 return NVPTXISD::Suld1DI32Clamp; 3358 case Intrinsic::nvvm_suld_1d_i64_clamp: 3359 return NVPTXISD::Suld1DI64Clamp; 3360 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3361 return NVPTXISD::Suld1DV2I8Clamp; 3362 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3363 return NVPTXISD::Suld1DV2I16Clamp; 3364 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3365 return NVPTXISD::Suld1DV2I32Clamp; 3366 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3367 return NVPTXISD::Suld1DV2I64Clamp; 3368 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3369 return NVPTXISD::Suld1DV4I8Clamp; 3370 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3371 return NVPTXISD::Suld1DV4I16Clamp; 3372 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3373 return NVPTXISD::Suld1DV4I32Clamp; 3374 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3375 return NVPTXISD::Suld1DArrayI8Clamp; 3376 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3377 return NVPTXISD::Suld1DArrayI16Clamp; 3378 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3379 return NVPTXISD::Suld1DArrayI32Clamp; 3380 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3381 return NVPTXISD::Suld1DArrayI64Clamp; 3382 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3383 return NVPTXISD::Suld1DArrayV2I8Clamp; 3384 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3385 return NVPTXISD::Suld1DArrayV2I16Clamp; 3386 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3387 return NVPTXISD::Suld1DArrayV2I32Clamp; 3388 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3389 return NVPTXISD::Suld1DArrayV2I64Clamp; 3390 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3391 return NVPTXISD::Suld1DArrayV4I8Clamp; 3392 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3393 return NVPTXISD::Suld1DArrayV4I16Clamp; 3394 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3395 return NVPTXISD::Suld1DArrayV4I32Clamp; 3396 case Intrinsic::nvvm_suld_2d_i8_clamp: 3397 return NVPTXISD::Suld2DI8Clamp; 3398 case Intrinsic::nvvm_suld_2d_i16_clamp: 3399 return NVPTXISD::Suld2DI16Clamp; 3400 case Intrinsic::nvvm_suld_2d_i32_clamp: 3401 return NVPTXISD::Suld2DI32Clamp; 3402 case Intrinsic::nvvm_suld_2d_i64_clamp: 3403 return NVPTXISD::Suld2DI64Clamp; 3404 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3405 return NVPTXISD::Suld2DV2I8Clamp; 3406 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3407 return NVPTXISD::Suld2DV2I16Clamp; 3408 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3409 return NVPTXISD::Suld2DV2I32Clamp; 3410 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3411 return NVPTXISD::Suld2DV2I64Clamp; 3412 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3413 return NVPTXISD::Suld2DV4I8Clamp; 3414 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3415 return NVPTXISD::Suld2DV4I16Clamp; 3416 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3417 return NVPTXISD::Suld2DV4I32Clamp; 3418 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3419 return NVPTXISD::Suld2DArrayI8Clamp; 3420 case 
Intrinsic::nvvm_suld_2d_array_i16_clamp: 3421 return NVPTXISD::Suld2DArrayI16Clamp; 3422 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 3423 return NVPTXISD::Suld2DArrayI32Clamp; 3424 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3425 return NVPTXISD::Suld2DArrayI64Clamp; 3426 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3427 return NVPTXISD::Suld2DArrayV2I8Clamp; 3428 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3429 return NVPTXISD::Suld2DArrayV2I16Clamp; 3430 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3431 return NVPTXISD::Suld2DArrayV2I32Clamp; 3432 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3433 return NVPTXISD::Suld2DArrayV2I64Clamp; 3434 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3435 return NVPTXISD::Suld2DArrayV4I8Clamp; 3436 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3437 return NVPTXISD::Suld2DArrayV4I16Clamp; 3438 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3439 return NVPTXISD::Suld2DArrayV4I32Clamp; 3440 case Intrinsic::nvvm_suld_3d_i8_clamp: 3441 return NVPTXISD::Suld3DI8Clamp; 3442 case Intrinsic::nvvm_suld_3d_i16_clamp: 3443 return NVPTXISD::Suld3DI16Clamp; 3444 case Intrinsic::nvvm_suld_3d_i32_clamp: 3445 return NVPTXISD::Suld3DI32Clamp; 3446 case Intrinsic::nvvm_suld_3d_i64_clamp: 3447 return NVPTXISD::Suld3DI64Clamp; 3448 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3449 return NVPTXISD::Suld3DV2I8Clamp; 3450 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3451 return NVPTXISD::Suld3DV2I16Clamp; 3452 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3453 return NVPTXISD::Suld3DV2I32Clamp; 3454 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3455 return NVPTXISD::Suld3DV2I64Clamp; 3456 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 3457 return NVPTXISD::Suld3DV4I8Clamp; 3458 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 3459 return NVPTXISD::Suld3DV4I16Clamp; 3460 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 3461 return NVPTXISD::Suld3DV4I32Clamp; 3462 case Intrinsic::nvvm_suld_1d_i8_trap: 3463 return NVPTXISD::Suld1DI8Trap; 3464 case Intrinsic::nvvm_suld_1d_i16_trap: 3465 return NVPTXISD::Suld1DI16Trap; 3466 case Intrinsic::nvvm_suld_1d_i32_trap: 3467 return NVPTXISD::Suld1DI32Trap; 3468 case Intrinsic::nvvm_suld_1d_i64_trap: 3469 return NVPTXISD::Suld1DI64Trap; 3470 case Intrinsic::nvvm_suld_1d_v2i8_trap: 3471 return NVPTXISD::Suld1DV2I8Trap; 3472 case Intrinsic::nvvm_suld_1d_v2i16_trap: 3473 return NVPTXISD::Suld1DV2I16Trap; 3474 case Intrinsic::nvvm_suld_1d_v2i32_trap: 3475 return NVPTXISD::Suld1DV2I32Trap; 3476 case Intrinsic::nvvm_suld_1d_v2i64_trap: 3477 return NVPTXISD::Suld1DV2I64Trap; 3478 case Intrinsic::nvvm_suld_1d_v4i8_trap: 3479 return NVPTXISD::Suld1DV4I8Trap; 3480 case Intrinsic::nvvm_suld_1d_v4i16_trap: 3481 return NVPTXISD::Suld1DV4I16Trap; 3482 case Intrinsic::nvvm_suld_1d_v4i32_trap: 3483 return NVPTXISD::Suld1DV4I32Trap; 3484 case Intrinsic::nvvm_suld_1d_array_i8_trap: 3485 return NVPTXISD::Suld1DArrayI8Trap; 3486 case Intrinsic::nvvm_suld_1d_array_i16_trap: 3487 return NVPTXISD::Suld1DArrayI16Trap; 3488 case Intrinsic::nvvm_suld_1d_array_i32_trap: 3489 return NVPTXISD::Suld1DArrayI32Trap; 3490 case Intrinsic::nvvm_suld_1d_array_i64_trap: 3491 return NVPTXISD::Suld1DArrayI64Trap; 3492 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 3493 return NVPTXISD::Suld1DArrayV2I8Trap; 3494 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 3495 return NVPTXISD::Suld1DArrayV2I16Trap; 3496 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 3497 return NVPTXISD::Suld1DArrayV2I32Trap; 3498 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 3499 return NVPTXISD::Suld1DArrayV2I64Trap; 3500 
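  // Illustrative note (not part of the original source): the I8/I16/I32/I64
  // and Clamp/Trap/Zero pieces of these opcode names encode the element type
  // and the PTX out-of-range handling mode of the surface load. For example,
  // the intrinsic llvm.nvvm.suld.2d.i32.trap is mapped below to
  // NVPTXISD::Suld2DI32Trap, which is ultimately emitted as (roughly) a
  // "suld.b.2d.b32.trap" instruction; the .clamp and .zero modes differ only
  // in how out-of-bounds coordinates are treated.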
case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 3501 return NVPTXISD::Suld1DArrayV4I8Trap; 3502 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 3503 return NVPTXISD::Suld1DArrayV4I16Trap; 3504 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 3505 return NVPTXISD::Suld1DArrayV4I32Trap; 3506 case Intrinsic::nvvm_suld_2d_i8_trap: 3507 return NVPTXISD::Suld2DI8Trap; 3508 case Intrinsic::nvvm_suld_2d_i16_trap: 3509 return NVPTXISD::Suld2DI16Trap; 3510 case Intrinsic::nvvm_suld_2d_i32_trap: 3511 return NVPTXISD::Suld2DI32Trap; 3512 case Intrinsic::nvvm_suld_2d_i64_trap: 3513 return NVPTXISD::Suld2DI64Trap; 3514 case Intrinsic::nvvm_suld_2d_v2i8_trap: 3515 return NVPTXISD::Suld2DV2I8Trap; 3516 case Intrinsic::nvvm_suld_2d_v2i16_trap: 3517 return NVPTXISD::Suld2DV2I16Trap; 3518 case Intrinsic::nvvm_suld_2d_v2i32_trap: 3519 return NVPTXISD::Suld2DV2I32Trap; 3520 case Intrinsic::nvvm_suld_2d_v2i64_trap: 3521 return NVPTXISD::Suld2DV2I64Trap; 3522 case Intrinsic::nvvm_suld_2d_v4i8_trap: 3523 return NVPTXISD::Suld2DV4I8Trap; 3524 case Intrinsic::nvvm_suld_2d_v4i16_trap: 3525 return NVPTXISD::Suld2DV4I16Trap; 3526 case Intrinsic::nvvm_suld_2d_v4i32_trap: 3527 return NVPTXISD::Suld2DV4I32Trap; 3528 case Intrinsic::nvvm_suld_2d_array_i8_trap: 3529 return NVPTXISD::Suld2DArrayI8Trap; 3530 case Intrinsic::nvvm_suld_2d_array_i16_trap: 3531 return NVPTXISD::Suld2DArrayI16Trap; 3532 case Intrinsic::nvvm_suld_2d_array_i32_trap: 3533 return NVPTXISD::Suld2DArrayI32Trap; 3534 case Intrinsic::nvvm_suld_2d_array_i64_trap: 3535 return NVPTXISD::Suld2DArrayI64Trap; 3536 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 3537 return NVPTXISD::Suld2DArrayV2I8Trap; 3538 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 3539 return NVPTXISD::Suld2DArrayV2I16Trap; 3540 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 3541 return NVPTXISD::Suld2DArrayV2I32Trap; 3542 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 3543 return NVPTXISD::Suld2DArrayV2I64Trap; 3544 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 3545 return NVPTXISD::Suld2DArrayV4I8Trap; 3546 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 3547 return NVPTXISD::Suld2DArrayV4I16Trap; 3548 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 3549 return NVPTXISD::Suld2DArrayV4I32Trap; 3550 case Intrinsic::nvvm_suld_3d_i8_trap: 3551 return NVPTXISD::Suld3DI8Trap; 3552 case Intrinsic::nvvm_suld_3d_i16_trap: 3553 return NVPTXISD::Suld3DI16Trap; 3554 case Intrinsic::nvvm_suld_3d_i32_trap: 3555 return NVPTXISD::Suld3DI32Trap; 3556 case Intrinsic::nvvm_suld_3d_i64_trap: 3557 return NVPTXISD::Suld3DI64Trap; 3558 case Intrinsic::nvvm_suld_3d_v2i8_trap: 3559 return NVPTXISD::Suld3DV2I8Trap; 3560 case Intrinsic::nvvm_suld_3d_v2i16_trap: 3561 return NVPTXISD::Suld3DV2I16Trap; 3562 case Intrinsic::nvvm_suld_3d_v2i32_trap: 3563 return NVPTXISD::Suld3DV2I32Trap; 3564 case Intrinsic::nvvm_suld_3d_v2i64_trap: 3565 return NVPTXISD::Suld3DV2I64Trap; 3566 case Intrinsic::nvvm_suld_3d_v4i8_trap: 3567 return NVPTXISD::Suld3DV4I8Trap; 3568 case Intrinsic::nvvm_suld_3d_v4i16_trap: 3569 return NVPTXISD::Suld3DV4I16Trap; 3570 case Intrinsic::nvvm_suld_3d_v4i32_trap: 3571 return NVPTXISD::Suld3DV4I32Trap; 3572 case Intrinsic::nvvm_suld_1d_i8_zero: 3573 return NVPTXISD::Suld1DI8Zero; 3574 case Intrinsic::nvvm_suld_1d_i16_zero: 3575 return NVPTXISD::Suld1DI16Zero; 3576 case Intrinsic::nvvm_suld_1d_i32_zero: 3577 return NVPTXISD::Suld1DI32Zero; 3578 case Intrinsic::nvvm_suld_1d_i64_zero: 3579 return NVPTXISD::Suld1DI64Zero; 3580 case Intrinsic::nvvm_suld_1d_v2i8_zero: 3581 return NVPTXISD::Suld1DV2I8Zero; 3582 
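  // Note (added, hedged): this table has .v2 variants up to 64-bit elements
  // but no .v4 64-bit variants, presumably because a v4.b64 access (256 bits)
  // would exceed the 128 bits a single PTX surface load can return.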
case Intrinsic::nvvm_suld_1d_v2i16_zero: 3583 return NVPTXISD::Suld1DV2I16Zero; 3584 case Intrinsic::nvvm_suld_1d_v2i32_zero: 3585 return NVPTXISD::Suld1DV2I32Zero; 3586 case Intrinsic::nvvm_suld_1d_v2i64_zero: 3587 return NVPTXISD::Suld1DV2I64Zero; 3588 case Intrinsic::nvvm_suld_1d_v4i8_zero: 3589 return NVPTXISD::Suld1DV4I8Zero; 3590 case Intrinsic::nvvm_suld_1d_v4i16_zero: 3591 return NVPTXISD::Suld1DV4I16Zero; 3592 case Intrinsic::nvvm_suld_1d_v4i32_zero: 3593 return NVPTXISD::Suld1DV4I32Zero; 3594 case Intrinsic::nvvm_suld_1d_array_i8_zero: 3595 return NVPTXISD::Suld1DArrayI8Zero; 3596 case Intrinsic::nvvm_suld_1d_array_i16_zero: 3597 return NVPTXISD::Suld1DArrayI16Zero; 3598 case Intrinsic::nvvm_suld_1d_array_i32_zero: 3599 return NVPTXISD::Suld1DArrayI32Zero; 3600 case Intrinsic::nvvm_suld_1d_array_i64_zero: 3601 return NVPTXISD::Suld1DArrayI64Zero; 3602 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 3603 return NVPTXISD::Suld1DArrayV2I8Zero; 3604 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 3605 return NVPTXISD::Suld1DArrayV2I16Zero; 3606 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 3607 return NVPTXISD::Suld1DArrayV2I32Zero; 3608 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 3609 return NVPTXISD::Suld1DArrayV2I64Zero; 3610 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 3611 return NVPTXISD::Suld1DArrayV4I8Zero; 3612 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 3613 return NVPTXISD::Suld1DArrayV4I16Zero; 3614 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 3615 return NVPTXISD::Suld1DArrayV4I32Zero; 3616 case Intrinsic::nvvm_suld_2d_i8_zero: 3617 return NVPTXISD::Suld2DI8Zero; 3618 case Intrinsic::nvvm_suld_2d_i16_zero: 3619 return NVPTXISD::Suld2DI16Zero; 3620 case Intrinsic::nvvm_suld_2d_i32_zero: 3621 return NVPTXISD::Suld2DI32Zero; 3622 case Intrinsic::nvvm_suld_2d_i64_zero: 3623 return NVPTXISD::Suld2DI64Zero; 3624 case Intrinsic::nvvm_suld_2d_v2i8_zero: 3625 return NVPTXISD::Suld2DV2I8Zero; 3626 case Intrinsic::nvvm_suld_2d_v2i16_zero: 3627 return NVPTXISD::Suld2DV2I16Zero; 3628 case Intrinsic::nvvm_suld_2d_v2i32_zero: 3629 return NVPTXISD::Suld2DV2I32Zero; 3630 case Intrinsic::nvvm_suld_2d_v2i64_zero: 3631 return NVPTXISD::Suld2DV2I64Zero; 3632 case Intrinsic::nvvm_suld_2d_v4i8_zero: 3633 return NVPTXISD::Suld2DV4I8Zero; 3634 case Intrinsic::nvvm_suld_2d_v4i16_zero: 3635 return NVPTXISD::Suld2DV4I16Zero; 3636 case Intrinsic::nvvm_suld_2d_v4i32_zero: 3637 return NVPTXISD::Suld2DV4I32Zero; 3638 case Intrinsic::nvvm_suld_2d_array_i8_zero: 3639 return NVPTXISD::Suld2DArrayI8Zero; 3640 case Intrinsic::nvvm_suld_2d_array_i16_zero: 3641 return NVPTXISD::Suld2DArrayI16Zero; 3642 case Intrinsic::nvvm_suld_2d_array_i32_zero: 3643 return NVPTXISD::Suld2DArrayI32Zero; 3644 case Intrinsic::nvvm_suld_2d_array_i64_zero: 3645 return NVPTXISD::Suld2DArrayI64Zero; 3646 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 3647 return NVPTXISD::Suld2DArrayV2I8Zero; 3648 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 3649 return NVPTXISD::Suld2DArrayV2I16Zero; 3650 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 3651 return NVPTXISD::Suld2DArrayV2I32Zero; 3652 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 3653 return NVPTXISD::Suld2DArrayV2I64Zero; 3654 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 3655 return NVPTXISD::Suld2DArrayV4I8Zero; 3656 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 3657 return NVPTXISD::Suld2DArrayV4I16Zero; 3658 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 3659 return NVPTXISD::Suld2DArrayV4I32Zero; 3660 case Intrinsic::nvvm_suld_3d_i8_zero: 3661 return 
NVPTXISD::Suld3DI8Zero; 3662 case Intrinsic::nvvm_suld_3d_i16_zero: 3663 return NVPTXISD::Suld3DI16Zero; 3664 case Intrinsic::nvvm_suld_3d_i32_zero: 3665 return NVPTXISD::Suld3DI32Zero; 3666 case Intrinsic::nvvm_suld_3d_i64_zero: 3667 return NVPTXISD::Suld3DI64Zero; 3668 case Intrinsic::nvvm_suld_3d_v2i8_zero: 3669 return NVPTXISD::Suld3DV2I8Zero; 3670 case Intrinsic::nvvm_suld_3d_v2i16_zero: 3671 return NVPTXISD::Suld3DV2I16Zero; 3672 case Intrinsic::nvvm_suld_3d_v2i32_zero: 3673 return NVPTXISD::Suld3DV2I32Zero; 3674 case Intrinsic::nvvm_suld_3d_v2i64_zero: 3675 return NVPTXISD::Suld3DV2I64Zero; 3676 case Intrinsic::nvvm_suld_3d_v4i8_zero: 3677 return NVPTXISD::Suld3DV4I8Zero; 3678 case Intrinsic::nvvm_suld_3d_v4i16_zero: 3679 return NVPTXISD::Suld3DV4I16Zero; 3680 case Intrinsic::nvvm_suld_3d_v4i32_zero: 3681 return NVPTXISD::Suld3DV4I32Zero; 3682 } 3683 } 3684 3685 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 3686 // TgtMemIntrinsic 3687 // because we need the information that is only available in the "Value" type 3688 // of destination 3689 // pointer. In particular, the address space information. 3690 bool NVPTXTargetLowering::getTgtMemIntrinsic( 3691 IntrinsicInfo &Info, const CallInst &I, 3692 MachineFunction &MF, unsigned Intrinsic) const { 3693 switch (Intrinsic) { 3694 default: 3695 return false; 3696 case Intrinsic::nvvm_match_all_sync_i32p: 3697 case Intrinsic::nvvm_match_all_sync_i64p: 3698 Info.opc = ISD::INTRINSIC_W_CHAIN; 3699 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 3700 // in order to model data exchange with other threads, but perform no real 3701 // memory accesses. 3702 Info.memVT = MVT::i1; 3703 3704 // Our result depends on both our and other thread's arguments. 3705 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 3706 return true; 3707 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: 3708 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: 3709 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: 3710 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: 3711 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: 3712 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: 3713 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: 3714 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: 3715 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: 3716 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: 3717 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: 3718 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: 3719 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: 3720 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: 3721 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: 3722 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: 3723 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: 3724 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: 3725 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: 3726 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: 3727 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: 3728 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: 3729 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: 3730 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { 3731 Info.opc = ISD::INTRINSIC_W_CHAIN; 3732 Info.memVT = MVT::v8f16; 3733 Info.ptrVal = I.getArgOperand(0); 3734 Info.offset = 0; 3735 Info.flags = MachineMemOperand::MOLoad; 3736 Info.align = Align(16); 3737 return true; 3738 } 3739 case 
Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: 3740 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: 3741 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: 3742 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: 3743 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: 3744 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: 3745 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: 3746 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: 3747 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: 3748 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: 3749 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: 3750 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: 3751 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: 3752 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: 3753 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: 3754 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 3755 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 3756 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 3757 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 3758 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: 3759 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: 3760 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: 3761 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: 3762 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { 3763 Info.opc = ISD::INTRINSIC_W_CHAIN; 3764 Info.memVT = MVT::v2i32; 3765 Info.ptrVal = I.getArgOperand(0); 3766 Info.offset = 0; 3767 Info.flags = MachineMemOperand::MOLoad; 3768 Info.align = Align(8); 3769 return true; 3770 } 3771 3772 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 3773 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 3774 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 3775 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 3776 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 3777 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 3778 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 3779 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 3780 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: 3781 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: 3782 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: 3783 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: 3784 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: 3785 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: 3786 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: 3787 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: 3788 3789 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 3790 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 3791 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 3792 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 3793 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 3794 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 3795 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 3796 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: 3797 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: 3798 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: 3799 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: 3800 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: 3801 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: 3802 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: 3803 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: 3804 case 
Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: 3805 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: 3806 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: { 3807 Info.opc = ISD::INTRINSIC_W_CHAIN; 3808 Info.memVT = MVT::v4i32; 3809 Info.ptrVal = I.getArgOperand(0); 3810 Info.offset = 0; 3811 Info.flags = MachineMemOperand::MOLoad; 3812 Info.align = Align(16); 3813 return true; 3814 } 3815 3816 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 3817 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 3818 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 3819 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 3820 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 3821 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 3822 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 3823 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 3824 3825 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 3826 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 3827 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 3828 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 3829 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 3830 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 3831 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 3832 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 3833 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 3834 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 3835 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 3836 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 3837 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 3838 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 3839 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 3840 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 3841 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 3842 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 3843 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 3844 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: 3845 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: 3846 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: { 3847 Info.opc = ISD::INTRINSIC_W_CHAIN; 3848 Info.memVT = MVT::i32; 3849 Info.ptrVal = I.getArgOperand(0); 3850 Info.offset = 0; 3851 Info.flags = MachineMemOperand::MOLoad; 3852 Info.align = Align(4); 3853 return true; 3854 } 3855 3856 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 3857 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 3858 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 3859 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 3860 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 3861 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 3862 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 3863 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 3864 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 3865 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 3866 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 3867 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 3868 Info.opc = ISD::INTRINSIC_W_CHAIN; 3869 Info.memVT = MVT::v4f16; 3870 Info.ptrVal = I.getArgOperand(0); 3871 Info.offset = 0; 3872 Info.flags = MachineMemOperand::MOLoad; 3873 Info.align = Align(16); 3874 return true; 3875 } 3876 3877 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 3878 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 3879 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 3880 
case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 3881 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 3882 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 3883 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 3884 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 3885 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 3886 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 3887 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 3888 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: 3889 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: 3890 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: 3891 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: 3892 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { 3893 Info.opc = ISD::INTRINSIC_W_CHAIN; 3894 Info.memVT = MVT::v8f32; 3895 Info.ptrVal = I.getArgOperand(0); 3896 Info.offset = 0; 3897 Info.flags = MachineMemOperand::MOLoad; 3898 Info.align = Align(16); 3899 return true; 3900 } 3901 3902 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: 3903 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: 3904 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: 3905 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: 3906 3907 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: 3908 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: 3909 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: 3910 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: 3911 3912 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 3913 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 3914 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 3915 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 3916 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 3917 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 3918 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 3919 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 3920 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 3921 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 3922 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 3923 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 3924 Info.opc = ISD::INTRINSIC_W_CHAIN; 3925 Info.memVT = MVT::v8i32; 3926 Info.ptrVal = I.getArgOperand(0); 3927 Info.offset = 0; 3928 Info.flags = MachineMemOperand::MOLoad; 3929 Info.align = Align(16); 3930 return true; 3931 } 3932 3933 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 3934 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 3935 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 3936 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 3937 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 3938 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 3939 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 3940 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: 3941 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: 3942 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: { 3943 Info.opc = ISD::INTRINSIC_W_CHAIN; 3944 Info.memVT = MVT::v2i32; 3945 Info.ptrVal = I.getArgOperand(0); 3946 Info.offset = 0; 3947 Info.flags = MachineMemOperand::MOLoad; 3948 Info.align = Align(8); 3949 return true; 3950 } 3951 3952 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: 3953 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: 3954 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: 3955 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: 3956 3957 case 
Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: 3958 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: 3959 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: 3960 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { 3961 Info.opc = ISD::INTRINSIC_W_CHAIN; 3962 Info.memVT = MVT::f64; 3963 Info.ptrVal = I.getArgOperand(0); 3964 Info.offset = 0; 3965 Info.flags = MachineMemOperand::MOLoad; 3966 Info.align = Align(8); 3967 return true; 3968 } 3969 3970 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: 3971 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: 3972 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: 3973 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { 3974 Info.opc = ISD::INTRINSIC_W_CHAIN; 3975 Info.memVT = MVT::v2f64; 3976 Info.ptrVal = I.getArgOperand(0); 3977 Info.offset = 0; 3978 Info.flags = MachineMemOperand::MOLoad; 3979 Info.align = Align(16); 3980 return true; 3981 } 3982 3983 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 3984 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 3985 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 3986 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 3987 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 3988 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 3989 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 3990 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 3991 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 3992 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 3993 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 3994 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 3995 Info.opc = ISD::INTRINSIC_VOID; 3996 Info.memVT = MVT::v4f16; 3997 Info.ptrVal = I.getArgOperand(0); 3998 Info.offset = 0; 3999 Info.flags = MachineMemOperand::MOStore; 4000 Info.align = Align(16); 4001 return true; 4002 } 4003 4004 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 4005 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 4006 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 4007 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 4008 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 4009 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 4010 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 4011 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 4012 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 4013 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 4014 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 4015 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: 4016 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: 4017 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: 4018 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: 4019 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { 4020 Info.opc = ISD::INTRINSIC_VOID; 4021 Info.memVT = MVT::v8f32; 4022 Info.ptrVal = I.getArgOperand(0); 4023 Info.offset = 0; 4024 Info.flags = MachineMemOperand::MOStore; 4025 Info.align = Align(16); 4026 return true; 4027 } 4028 4029 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 4030 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 4031 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 4032 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 4033 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 4034 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 4035 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 4036 case 
Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 4037 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 4038 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 4039 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 4040 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 4041 Info.opc = ISD::INTRINSIC_VOID; 4042 Info.memVT = MVT::v8i32; 4043 Info.ptrVal = I.getArgOperand(0); 4044 Info.offset = 0; 4045 Info.flags = MachineMemOperand::MOStore; 4046 Info.align = Align(16); 4047 return true; 4048 } 4049 4050 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 4051 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 4052 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 4053 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 4054 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 4055 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 4056 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 4057 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 4058 Info.opc = ISD::INTRINSIC_VOID; 4059 Info.memVT = MVT::v2i32; 4060 Info.ptrVal = I.getArgOperand(0); 4061 Info.offset = 0; 4062 Info.flags = MachineMemOperand::MOStore; 4063 Info.align = Align(8); 4064 return true; 4065 } 4066 4067 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: 4068 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: 4069 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: 4070 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { 4071 Info.opc = ISD::INTRINSIC_VOID; 4072 Info.memVT = MVT::v2f64; 4073 Info.ptrVal = I.getArgOperand(0); 4074 Info.offset = 0; 4075 Info.flags = MachineMemOperand::MOStore; 4076 Info.align = Align(16); 4077 return true; 4078 } 4079 4080 case Intrinsic::nvvm_atomic_load_inc_32: 4081 case Intrinsic::nvvm_atomic_load_dec_32: 4082 4083 case Intrinsic::nvvm_atomic_add_gen_f_cta: 4084 case Intrinsic::nvvm_atomic_add_gen_f_sys: 4085 case Intrinsic::nvvm_atomic_add_gen_i_cta: 4086 case Intrinsic::nvvm_atomic_add_gen_i_sys: 4087 case Intrinsic::nvvm_atomic_and_gen_i_cta: 4088 case Intrinsic::nvvm_atomic_and_gen_i_sys: 4089 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 4090 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 4091 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 4092 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 4093 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 4094 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 4095 case Intrinsic::nvvm_atomic_max_gen_i_cta: 4096 case Intrinsic::nvvm_atomic_max_gen_i_sys: 4097 case Intrinsic::nvvm_atomic_min_gen_i_cta: 4098 case Intrinsic::nvvm_atomic_min_gen_i_sys: 4099 case Intrinsic::nvvm_atomic_or_gen_i_cta: 4100 case Intrinsic::nvvm_atomic_or_gen_i_sys: 4101 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 4102 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 4103 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 4104 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 4105 auto &DL = I.getModule()->getDataLayout(); 4106 Info.opc = ISD::INTRINSIC_W_CHAIN; 4107 Info.memVT = getValueType(DL, I.getType()); 4108 Info.ptrVal = I.getArgOperand(0); 4109 Info.offset = 0; 4110 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4111 Info.align.reset(); 4112 return true; 4113 } 4114 4115 case Intrinsic::nvvm_ldu_global_i: 4116 case Intrinsic::nvvm_ldu_global_f: 4117 case Intrinsic::nvvm_ldu_global_p: { 4118 auto &DL = I.getModule()->getDataLayout(); 4119 Info.opc = ISD::INTRINSIC_W_CHAIN; 4120 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 4121 Info.memVT = getValueType(DL, I.getType()); 4122 else if(Intrinsic == 
Intrinsic::nvvm_ldu_global_p) 4123 Info.memVT = getPointerTy(DL); 4124 else 4125 Info.memVT = getValueType(DL, I.getType()); 4126 Info.ptrVal = I.getArgOperand(0); 4127 Info.offset = 0; 4128 Info.flags = MachineMemOperand::MOLoad; 4129 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4130 4131 return true; 4132 } 4133 case Intrinsic::nvvm_ldg_global_i: 4134 case Intrinsic::nvvm_ldg_global_f: 4135 case Intrinsic::nvvm_ldg_global_p: { 4136 auto &DL = I.getModule()->getDataLayout(); 4137 4138 Info.opc = ISD::INTRINSIC_W_CHAIN; 4139 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 4140 Info.memVT = getValueType(DL, I.getType()); 4141 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 4142 Info.memVT = getPointerTy(DL); 4143 else 4144 Info.memVT = getValueType(DL, I.getType()); 4145 Info.ptrVal = I.getArgOperand(0); 4146 Info.offset = 0; 4147 Info.flags = MachineMemOperand::MOLoad; 4148 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4149 4150 return true; 4151 } 4152 4153 case Intrinsic::nvvm_tex_1d_v4f32_s32: 4154 case Intrinsic::nvvm_tex_1d_v4f32_f32: 4155 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 4156 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 4157 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 4158 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 4159 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 4160 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 4161 case Intrinsic::nvvm_tex_2d_v4f32_s32: 4162 case Intrinsic::nvvm_tex_2d_v4f32_f32: 4163 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 4164 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 4165 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 4166 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 4167 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 4168 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 4169 case Intrinsic::nvvm_tex_3d_v4f32_s32: 4170 case Intrinsic::nvvm_tex_3d_v4f32_f32: 4171 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 4172 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 4173 case Intrinsic::nvvm_tex_cube_v4f32_f32: 4174 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 4175 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 4176 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 4177 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 4178 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 4179 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 4180 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 4181 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 4182 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 4183 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 4184 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 4185 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 4186 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 4187 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 4188 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 4189 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 4190 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 4191 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 4192 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 4193 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 4194 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 4195 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 4196 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 4197 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 4198 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 4199 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 4200 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 4201 
  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
    Info.opc = getOpcForTextureInstr(Intrinsic);
    Info.memVT = MVT::v4f32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_tex_1d_v4s32_s32:
  case Intrinsic::nvvm_tex_1d_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_v4s32_s32:
  case Intrinsic::nvvm_tex_2d_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_v4s32_s32:
  case Intrinsic::nvvm_tex_3d_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_v4u32_s32:
  case Intrinsic::nvvm_tex_1d_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_v4u32_s32:
  case Intrinsic::nvvm_tex_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_v4u32_s32:
  case Intrinsic::nvvm_tex_3d_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
    Info.opc = getOpcForTextureInstr(Intrinsic);
    Info.memVT = MVT::v4i32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i8_clamp:
  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
  case Intrinsic::nvvm_suld_2d_i8_clamp:
  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
  case Intrinsic::nvvm_suld_3d_i8_clamp:
  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
  case Intrinsic::nvvm_suld_1d_i8_trap:
  case Intrinsic::nvvm_suld_1d_v2i8_trap:
  case Intrinsic::nvvm_suld_1d_v4i8_trap:
  case Intrinsic::nvvm_suld_1d_array_i8_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
  case Intrinsic::nvvm_suld_2d_i8_trap:
  case Intrinsic::nvvm_suld_2d_v2i8_trap:
  case Intrinsic::nvvm_suld_2d_v4i8_trap:
  case Intrinsic::nvvm_suld_2d_array_i8_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
  case Intrinsic::nvvm_suld_3d_i8_trap:
  case Intrinsic::nvvm_suld_3d_v2i8_trap:
  case Intrinsic::nvvm_suld_3d_v4i8_trap:
  case Intrinsic::nvvm_suld_1d_i8_zero:
  case Intrinsic::nvvm_suld_1d_v2i8_zero:
  case Intrinsic::nvvm_suld_1d_v4i8_zero:
  case Intrinsic::nvvm_suld_1d_array_i8_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
  case Intrinsic::nvvm_suld_2d_i8_zero:
  case Intrinsic::nvvm_suld_2d_v2i8_zero:
  case Intrinsic::nvvm_suld_2d_v4i8_zero:
  case Intrinsic::nvvm_suld_2d_array_i8_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
  case Intrinsic::nvvm_suld_3d_i8_zero:
  case Intrinsic::nvvm_suld_3d_v2i8_zero:
  case Intrinsic::nvvm_suld_3d_v4i8_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i8;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i16_clamp:
  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
  case Intrinsic::nvvm_suld_2d_i16_clamp:
  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
  case Intrinsic::nvvm_suld_3d_i16_clamp:
  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
  case Intrinsic::nvvm_suld_1d_i16_trap:
  case Intrinsic::nvvm_suld_1d_v2i16_trap:
  case Intrinsic::nvvm_suld_1d_v4i16_trap:
  case Intrinsic::nvvm_suld_1d_array_i16_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
  case Intrinsic::nvvm_suld_2d_i16_trap:
  case Intrinsic::nvvm_suld_2d_v2i16_trap:
  case Intrinsic::nvvm_suld_2d_v4i16_trap:
  case Intrinsic::nvvm_suld_2d_array_i16_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
  case Intrinsic::nvvm_suld_3d_i16_trap:
  case Intrinsic::nvvm_suld_3d_v2i16_trap:
  case Intrinsic::nvvm_suld_3d_v4i16_trap:
  case Intrinsic::nvvm_suld_1d_i16_zero:
  case Intrinsic::nvvm_suld_1d_v2i16_zero:
  case Intrinsic::nvvm_suld_1d_v4i16_zero:
  case Intrinsic::nvvm_suld_1d_array_i16_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
  case Intrinsic::nvvm_suld_2d_i16_zero:
  case Intrinsic::nvvm_suld_2d_v2i16_zero:
  case Intrinsic::nvvm_suld_2d_v4i16_zero:
  case Intrinsic::nvvm_suld_2d_array_i16_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
  case Intrinsic::nvvm_suld_3d_i16_zero:
  case Intrinsic::nvvm_suld_3d_v2i16_zero:
  case Intrinsic::nvvm_suld_3d_v4i16_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i16;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i32_clamp:
  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
  case Intrinsic::nvvm_suld_2d_i32_clamp:
  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
  case Intrinsic::nvvm_suld_3d_i32_clamp:
  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_i32_trap:
  case Intrinsic::nvvm_suld_1d_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_v4i32_trap:
  case Intrinsic::nvvm_suld_1d_array_i32_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
  case Intrinsic::nvvm_suld_2d_i32_trap:
  case Intrinsic::nvvm_suld_2d_v2i32_trap:
  case Intrinsic::nvvm_suld_2d_v4i32_trap:
  case Intrinsic::nvvm_suld_2d_array_i32_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
  case Intrinsic::nvvm_suld_3d_i32_trap:
  case Intrinsic::nvvm_suld_3d_v2i32_trap:
  case Intrinsic::nvvm_suld_3d_v4i32_trap:
  case Intrinsic::nvvm_suld_1d_i32_zero:
  case Intrinsic::nvvm_suld_1d_v2i32_zero:
  case Intrinsic::nvvm_suld_1d_v4i32_zero:
  case Intrinsic::nvvm_suld_1d_array_i32_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
  case Intrinsic::nvvm_suld_2d_i32_zero:
  case Intrinsic::nvvm_suld_2d_v2i32_zero:
  case Intrinsic::nvvm_suld_2d_v4i32_zero:
  case Intrinsic::nvvm_suld_2d_array_i32_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
  case Intrinsic::nvvm_suld_3d_i32_zero:
  case Intrinsic::nvvm_suld_3d_v2i32_zero:
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i64_clamp:
  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_i64_clamp:
  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_3d_i64_clamp:
  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_i64_trap:
  case Intrinsic::nvvm_suld_1d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_array_i64_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
  case Intrinsic::nvvm_suld_2d_i64_trap:
  case Intrinsic::nvvm_suld_2d_v2i64_trap:
  case Intrinsic::nvvm_suld_2d_array_i64_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
  case Intrinsic::nvvm_suld_3d_i64_trap:
  case Intrinsic::nvvm_suld_3d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_i64_zero:
  case Intrinsic::nvvm_suld_1d_v2i64_zero:
  case Intrinsic::nvvm_suld_1d_array_i64_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
  case Intrinsic::nvvm_suld_2d_i64_zero:
  case Intrinsic::nvvm_suld_2d_v2i64_zero:
  case Intrinsic::nvvm_suld_2d_array_i64_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
  case Intrinsic::nvvm_suld_3d_i64_zero:
  case Intrinsic::nvvm_suld_3d_v2i64_zero:
    Info.opc = getOpcForSurfaceInstr(Intrinsic);
    Info.memVT = MVT::i64;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  return false;
}

/// getFunctionParamOptimizedAlign - since function arguments are passed via
/// .param space, we may want to increase their alignment in a way that
/// ensures that we can effectively vectorize their loads & stores. We can
/// increase alignment only if the function has internal or private linkage,
/// as for other linkage types callers may already rely on the default
/// alignment. To allow using 128-bit vectorized loads/stores, this function
/// ensures that alignment is 16 or greater.
Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
    const Function *F, Type *ArgTy, const DataLayout &DL) const {
  const uint64_t ABITypeAlign = DL.getABITypeAlign(ArgTy).value();

  // If a function has linkage different from internal or private, we
  // must use default ABI alignment as external users rely on it. Same
  // for a function that may be called from a function pointer.
  if (!F || !F->hasLocalLinkage() ||
      F->hasAddressTaken(/*Users=*/nullptr,
                         /*IgnoreCallbackUses=*/false,
                         /*IgnoreAssumeLikeCalls=*/true,
                         /*IgnoreLLVMUsed=*/true))
    return Align(ABITypeAlign);

  assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
  return Align(std::max(uint64_t(16), ABITypeAlign));
}

/// Helper for computing alignment of a device function byval parameter.
Align NVPTXTargetLowering::getFunctionByValParamAlign(
    const Function *F, Type *ArgTy, Align InitialAlign,
    const DataLayout &DL) const {
  Align ArgAlign = InitialAlign;
  // Try to increase alignment to enhance vectorization options.
  if (F)
    ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));

  // Old ptx versions have a bug. When PTX code takes the address of a
  // byval parameter with alignment < 4, ptxas generates code to
  // spill the argument into memory. Alas, on sm_50+ ptxas generates
  // SASS code that fails with a misaligned access.
  // To work around the problem, make sure that we align byval parameters
  // by at least 4. This bug seems to be fixed at least starting from
  // ptxas > 9.0.
  // TODO: remove this after verifying the bug is not reproduced
  // on non-deprecated ptxas versions.
  if (ForceMinByValParamAlign)
    ArgAlign = std::max(ArgAlign, Align(4));

  return ArgAlign;
}

// Helper for getting a function parameter name. The name is composed from
// its index and the function name. A negative index corresponds to the
// special parameter (unsized array) used for passing variable arguments.
std::string NVPTXTargetLowering::getParamName(const Function *F,
                                              int Idx) const {
  std::string ParamName;
  raw_string_ostream ParamStr(ParamName);

  ParamStr << getTargetMachine().getSymbol(F)->getName();
  if (Idx < 0)
    ParamStr << "_vararg";
  else
    ParamStr << "_param_" << Idx;

  return ParamName;
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target-specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp).
bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS, Instruction *I) const {
  // AddrMode - This represents an addressing mode of:
  //   BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  //
  // The legal address modes are
  // - [avar]
  // - [areg]
  // - [areg+immoff]
  // - [immAddr]

  if (AM.BaseGV) {
    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
  }

  switch (AM.Scale) {
  case 0: // "r", "r+i" or "i" is allowed
    break;
  case 1:
    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
      return false;
    // Otherwise we have r+i.
    break;
  default:
    // No scale > 1 is allowed
    return false;
  }
  return true;
}

//===----------------------------------------------------------------------===//
// NVPTX Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
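/// Single-letter constraints handled here ('b', 'c', 'h', 'r', 'l', 'f', 'd',
/// '0', 'N') all denote register classes; any other constraint is deferred to
/// the generic TargetLowering handling.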
NVPTXTargetLowering::ConstraintType
NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'b':
    case 'r':
    case 'h':
    case 'c':
    case 'l':
    case 'f':
    case 'd':
    case '0':
    case 'N':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

std::pair<unsigned, const TargetRegisterClass *>
NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                  StringRef Constraint,
                                                  MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'b':
      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
    case 'c':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'h':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'r':
      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
    case 'l':
    case 'N':
      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
    case 'f':
      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
    case 'd':
      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

//===----------------------------------------------------------------------===//
// NVPTX DAG Combining
//===----------------------------------------------------------------------===//

bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
                                   CodeGenOpt::Level OptLevel) const {
  // Always honor command-line argument
  if (FMAContractLevelOpt.getNumOccurrences() > 0)
    return FMAContractLevelOpt > 0;

  // Do not contract if we're not optimizing the code.
  if (OptLevel == 0)
    return false;

  // Honor TargetOptions flags that explicitly say fusion is okay.
  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
    return true;

  return allowUnsafeFPMath(MF);
}

bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  if (MF.getTarget().Options.UnsafeFPMath)
    return true;

  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
  const Function &F = MF.getFunction();
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                             TargetLowering::DAGCombinerInfo &DCI,
                                             const NVPTXSubtarget &Subtarget,
                                             CodeGenOpt::Level OptLevel) {
  SelectionDAG &DAG = DCI.DAG;
  // Skip non-integer, non-scalar case
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
    // For integer:
    // Since integer multiply-add costs the same as integer multiply
    // but is more costly than integer add, do the fusion only when
    // the mul is only used in the add.
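    // For example, with a single-use i32 multiply:
    //   t1 = mul i32 %a, %b
    //   t2 = add i32 t1, %c
    // t2 is selected as one IMAD node (mad %a, %b, %c).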
    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
        !N0.getNode()->hasOneUse())
      return SDValue();

    // Do the folding
    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                       N0.getOperand(0), N0.getOperand(1), N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
    if (VT == MVT::f32 || VT == MVT::f64) {
      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
          &DAG.getTargetLoweringInfo());
      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
        return SDValue();

      // For floating point:
      // Do the fusion only when the mul has less than 5 uses and all
      // are add.
      // The heuristic is that if a use is not an add, then that use
      // cannot be fused into fma, therefore mul is still needed anyway.
      // If there are more than 4 uses, even if they are all add, fusing
      // them will increase register pressure.
      //
      int numUses = 0;
      int nonAddCount = 0;
      for (const SDNode *User : N0.getNode()->uses()) {
        numUses++;
        if (User->getOpcode() != ISD::FADD)
          ++nonAddCount;
      }
      if (numUses >= 5)
        return SDValue();
      if (nonAddCount) {
        int orderNo = N->getIROrder();
        int orderNo2 = N0.getNode()->getIROrder();
        // A simple heuristic here for estimating potential register
        // pressure: the IR-order difference is used to measure the
        // distance between def and use; the longer the distance, the
        // more likely it is to cause register pressure.
        if (orderNo - orderNo2 < 500)
          return SDValue();

        // Now, check if at least one of the FMUL's operands is live beyond
        // the node N, which guarantees that the FMA will not increase
        // register pressure at node N.
        bool opIsLive = false;
        const SDNode *left = N0.getOperand(0).getNode();
        const SDNode *right = N0.getOperand(1).getNode();

        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
          opIsLive = true;

        if (!opIsLive)
          for (const SDNode *User : left->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          for (const SDNode *User : right->uses()) {
            int orderNo3 = User->getIROrder();
            if (orderNo3 > orderNo) {
              opIsLive = true;
              break;
            }
          }

        if (!opIsLive)
          return SDValue();
      }

      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                         N0.getOperand(0), N0.getOperand(1), N1);
    }
  }

  return SDValue();
}

static SDValue PerformStoreRetvalCombine(SDNode *N) {
  // Operands from the 2nd to the last one are the values to be stored
  for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
    if (!N->getOperand(I).isUndef())
      return SDValue();

  // Operand 0 is the previous value in the chain. Cannot return EntryToken
  // as the previous value will become unused and eliminated later.
  return N->getOperand(0);
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const NVPTXSubtarget &Subtarget,
                                 CodeGenOpt::Level OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // First try with the default operand order.
  if (SDValue Result =
          PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
}

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if target type is integer),
  // and ANDs off the high 8 bits. Since we turn this load into a
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
  // nodes. Do that here.
  SDValue Val = N->getOperand(0);
  SDValue Mask = N->getOperand(1);

  if (isa<ConstantSDNode>(Val)) {
    std::swap(Val, Mask);
  }

  SDValue AExt;
  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
  if (Val.getOpcode() == ISD::ANY_EXTEND) {
    AExt = Val;
    Val = Val->getOperand(0);
  }

  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
    Val = Val->getOperand(0);
  }

  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
      Val->getOpcode() == NVPTXISD::LoadV4) {
    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
    if (!MaskCnst) {
      // Not an AND with a constant
      return SDValue();
    }

    uint64_t MaskVal = MaskCnst->getZExtValue();
    if (MaskVal != 0xff) {
      // Not an AND that chops off top 8 bits
      return SDValue();
    }

    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
    if (!Mem) {
      // Not a MemSDNode?!?
      return SDValue();
    }

    EVT MemVT = Mem->getMemoryVT();
    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
      // We only handle the i8 case
      return SDValue();
    }

    unsigned ExtType =
        cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands() - 1))->
            getZExtValue();
    if (ExtType == ISD::SEXTLOAD) {
      // If for some reason the load is a sextload, the and is needed to zero
      // out the high 8 bits
      return SDValue();
    }

    bool AddTo = false;
    if (AExt.getNode() != nullptr) {
      // Re-insert the ext as a zext.
      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                            AExt.getValueType(), Val);
      AddTo = true;
    }

    // If we get here, the AND is unnecessary. Just replace it with the load
    DCI.CombineTo(N, Val, AddTo);
  }

  return SDValue();
}

static SDValue PerformREMCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);

  // Don't do anything at less than -O2.
  if (OptLevel < CodeGenOpt::Default)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  bool IsSigned = N->getOpcode() == ISD::SREM;
  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
  const SDValue &Num = N->getOperand(0);
  const SDValue &Den = N->getOperand(1);

  for (const SDNode *U : Num->uses()) {
    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
        U->getOperand(1) == Den) {
      // Num % Den -> Num - (Num / Den) * Den
      return DAG.getNode(ISD::SUB, DL, VT, Num,
                         DAG.getNode(ISD::MUL, DL, VT,
                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
                                     Den));
    }
  }
  return SDValue();
}

enum OperandSignedness {
  Signed = 0,
  Unsigned,
  Unknown
};

/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
/// that can be demoted to \p OptSize bits without loss of information. The
/// signedness of the operand, if determinable, is placed in \p S.
static bool IsMulWideOperandDemotable(SDValue Op,
                                      unsigned OptSize,
                                      OperandSignedness &S) {
  S = Unknown;

  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getFixedSizeInBits() <= OptSize) {
      S = Signed;
      return true;
    }
  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getFixedSizeInBits() <= OptSize) {
      S = Unsigned;
      return true;
    }
  }

  return false;
}

/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
/// be demoted to \p OptSize bits without loss of information. If the operands
/// contain a constant, it should appear as the RHS operand. The signedness of
/// the operands is placed in \p IsSigned.
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
                                        unsigned OptSize,
                                        bool &IsSigned) {
  OperandSignedness LHSSign;

  // The LHS operand must be a demotable op
  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
    return false;

  // We should have been able to determine the signedness from the LHS
  if (LHSSign == Unknown)
    return false;

  IsSigned = (LHSSign == Signed);

  // The RHS can be a demotable op or a constant
  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
    const APInt &Val = CI->getAPIntValue();
    if (LHSSign == Unsigned) {
      return Val.isIntN(OptSize);
    } else {
      return Val.isSignedIntN(OptSize);
    }
  } else {
    OperandSignedness RHSSign;
    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
      return false;

    return LHSSign == RHSSign;
  }
}

/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
/// amount.
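/// For example, an i32 multiply whose operands are sign-extended from i16 (or
/// are constants that fit in 16 signed bits) can be selected as mul.wide.s16,
/// and an i64 multiply of zero-extended i32 values as mul.wide.u32.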
static SDValue TryMULWIDECombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  EVT MulType = N->getValueType(0);
  if (MulType != MVT::i32 && MulType != MVT::i64) {
    return SDValue();
  }

  SDLoc DL(N);
  unsigned OptSize = MulType.getSizeInBits() >> 1;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Canonicalize the multiply so the constant (if any) is on the right
  if (N->getOpcode() == ISD::MUL) {
    if (isa<ConstantSDNode>(LHS)) {
      std::swap(LHS, RHS);
    }
  }

  // If we have a SHL, determine the actual multiply amount
  if (N->getOpcode() == ISD::SHL) {
    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
    if (!ShlRHS) {
      return SDValue();
    }

    APInt ShiftAmt = ShlRHS->getAPIntValue();
    unsigned BitWidth = MulType.getSizeInBits();
    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
    } else {
      return SDValue();
    }
  }

  bool Signed;
  // Verify that our operands are demotable
  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
    return SDValue();
  }

  EVT DemotedVT;
  if (MulType == MVT::i32) {
    DemotedVT = MVT::i16;
  } else {
    DemotedVT = MVT::i32;
  }

  // Truncate the operands to the correct size. Note that these are just for
  // type consistency and will (likely) be eliminated in later phases.
  SDValue TruncLHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
  SDValue TruncRHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);

  unsigned Opc;
  if (Signed) {
    Opc = NVPTXISD::MUL_WIDE_SIGNED;
  } else {
    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
  }

  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
}

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOpt::Level OptLevel) {
  if (OptLevel > 0) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);

  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
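  // For example, (setcc v2i1 (v2f16 %a), (v2f16 %b), cc) becomes one
  // SETP_F16X2 node whose two i1 results are packed back into a v2i1 below.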
  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
                                   DCI.DAG.getVTList(MVT::i1, MVT::i1),
                                   {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD:
  case ISD::FADD:
    return PerformADDCombine(N, DCI, STI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI);
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    return PerformStoreRetvalCombine(N);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16: // <4 x f16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
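  // For example, a <4 x i8> load is emitted as a LoadV4 producing four i16
  // values; each value is truncated back to i8 before the final build_vector.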
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool LoadF16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have an ld.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
    assert(Isf16Orbf16Type(EltVT.getSimpleVT()) &&
           "Unsupported v8 vector type.");
    LoadF16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT VVT = (EltVT == MVT::f16) ? MVT::v2f16 : MVT::v2bf16;
    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (LoadF16x2) {
    // Split v2f16 subvectors back into individual elements.
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}

static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization.
      // Therefore, we must ensure the type is legal. For i1 and i8, we set the
      // loaded type to i16 and propagate the "real" type as the memory type.
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands

      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      // Others
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec =
          DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  }
}

NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}