//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <optional>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static std::atomic<unsigned> GlobalUniqueCallSite;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));

static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
             " 1: do it, 2: do it aggressively)"),
    cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

static cl::opt<bool> ForceMinByValParamAlign(
    "nvptx-force-min-byval-param-align", cl::Hidden,
    cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
             " params of device functions."),
    cl::init(false));

int
NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}

bool NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v8i16: // <4 x i16x2>
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16: // <4 x f16x2>
  case MVT::v2bf16:
  case MVT::v4bf16:
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

static bool Is16bitsType(MVT VT) {
  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
          VT.SimpleTy == MVT::i16);
}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64).
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse its elements with this custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
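    // Illustrative example (derived from the rules below): an <8 x half> IR
    // value is reported as four MVT::v2f16 pieces at byte offsets 0, 4, 8 and
    // 12 from the vector's base offset, and a <3 x i8> value becomes a single
    // MVT::v4i8 piece, so ValueVTs/Offsets stay in lock-step with Ins/Outs.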
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // Vectors with an even number of 16-bit elements will be passed to
      // us as an array of v2f16/v2bf16/v2i16 elements. We must match this so
      // we stay in sync with Ins/Outs.
      if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) {
        switch (EltVT.getSimpleVT().SimpleTy) {
        case MVT::f16:
          EltVT = MVT::v2f16;
          break;
        case MVT::bf16:
          EltVT = MVT::v2bf16;
          break;
        case MVT::i16:
          EltVT = MVT::v2i16;
          break;
        default:
          llvm_unreachable("Unexpected type");
        }
        NumElts /= 2;
      } else if (EltVT.getSimpleVT() == MVT::i8 &&
                 (NumElts % 4 == 0 || NumElts == 3)) {
        // v*i8 are formally lowered as v4i8
        EltVT = MVT::v4i8;
        NumElts = (NumElts + 3) / 4;
      } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) {
        // v2i8 is promoted to v2i16
        NumElts = 1;
        EltVT = MVT::v2i16;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}

/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
///
/// The promoted type is placed in \p PromotedVT if the function returns true.
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
  if (VT.isScalarInteger()) {
    switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
    default:
      llvm_unreachable(
          "Promotion is not suitable for scalars of size larger than 64-bits");
    case 1:
      *PromotedVT = MVT::i1;
      break;
    case 2:
    case 4:
    case 8:
      *PromotedVT = MVT::i8;
      break;
    case 16:
      *PromotedVT = MVT::i16;
      break;
    case 32:
      *PromotedVT = MVT::i32;
      break;
    case 64:
      *PromotedVT = MVT::i64;
      break;
    }
    return EVT(*PromotedVT) != VT;
  }
  return false;
}

// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
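  // (For example, a 4-element access requested at the next-to-last piece
  // would run past the end of ValueVTs.)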
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
  return NumElts;
}

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment, bool IsVAArg = false) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  if (IsVAArg)
    return VectorInfo;

  // Check what we can vectorize using 128/64/32-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using the largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
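  // Raising these thresholds to UINT_MAX effectively means "always expand
  // inline"; PTX code has no standard C runtime to provide such libcalls.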
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned)0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling.
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
  };

  auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoBF16Action) {
    bool IsOpSupported = STI.hasBF16Math();
    // A few instructions are available on sm_90 only.
    switch (Op) {
    case ISD::FADD:
    case ISD::FMUL:
    case ISD::FSUB:
    case ISD::SELECT:
    case ISD::SELECT_CC:
    case ISD::SETCC:
    case ISD::FEXP2:
    case ISD::FCEIL:
    case ISD::FFLOOR:
    case ISD::FNEARBYINT:
    case ISD::FRINT:
    case ISD::FROUNDEVEN:
    case ISD::FTRUNC:
      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
      break;
    }
    setOperationAction(Op, VT, IsOpSupported ? Action : NoBF16Action);
  };

  auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                     LegalizeAction NoI16x2Action) {
    bool IsOpSupported = false;
    // These instructions are available on sm_90 only.
    switch (Op) {
    case ISD::ADD:
    case ISD::SMAX:
    case ISD::SMIN:
    case ISD::UMIN:
    case ISD::UMAX:
      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
      break;
    }
    setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
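  // A v2f16 value lives in a single 32-bit register (see the register classes
  // above), so BUILD_VECTOR/EXTRACT_VECTOR_ELT are custom-lowered to pack and
  // unpack the two 16-bit halves rather than being scalarized.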
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Conversion to/from BF16/BF16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);

  setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
  setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
  if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
    AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);

  // Conversion to/from i16/i16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
  // Only logical ops can be done on v4i8 directly, others must be done
  // elementwise.
  setOperationAction(
      {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE,
       ISD::BITREVERSE, ISD::CTLZ, ISD::CTPOP, ISD::CTTZ,
       ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FSHL, ISD::FSHR,
       ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::PARITY,
       ISD::ROTL, ISD::ROTR, ISD::SADDO, ISD::SADDO_CARRY,
       ISD::SADDSAT, ISD::SDIV, ISD::SDIVREM, ISD::SELECT_CC,
       ISD::SETCC, ISD::SHL, ISD::SINT_TO_FP, ISD::SMAX,
       ISD::SMIN, ISD::SMULO, ISD::SMUL_LOHI, ISD::SRA,
       ISD::SREM, ISD::SRL, ISD::SSHLSAT, ISD::SSUBO,
       ISD::SSUBO_CARRY, ISD::SSUBSAT, ISD::SUB, ISD::SUBC,
       ISD::SUBE, ISD::UADDO, ISD::UADDO_CARRY, ISD::UADDSAT,
       ISD::UDIV, ISD::UDIVREM, ISD::UINT_TO_FP, ISD::UMAX,
       ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM,
       ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT,
       ISD::USUBSAT},
      MVT::v4i8, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
                 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
                 MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using the cvt instruction.
  // For others we will expand to a SHL/SRA pair.
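  // (e.g. sign-extending the low i1 of an i32 in-register becomes a shl by 31
  // followed by an sra by 31.)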
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
  // that don't have h/w rotation we lower them to multi-instruction assembly.
  // See ROT*_sw in NVPTXIntrInfo.td
  setOperationAction(ISD::ROTL, MVT::i64, Legal);
  setOperationAction(ISD::ROTR, MVT::i64, Legal);
  setOperationAction(ISD::ROTL, MVT::i32, Legal);
  setOperationAction(ISD::ROTR, MVT::i32, Legal);

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend.
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  // Turn FP truncstore into trunc + store.
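  // (e.g. storing an f32 value to an f16 location becomes an FP_ROUND,
  // lowered to cvt.rn.f16.f32, followed by an ordinary 16-bit store.)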
  // FIXME: vector types should also be expanded.
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store of predicate registers.
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // Expand extload of vectors of integers.
  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
                   MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);

  // This is legal in NVPTX.
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
  setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

  // TRAP can be lowered to PTX trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores.
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Support varargs.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Custom handling for i8 intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
  setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);

  setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);

  // Other arithmetic and logic ops are unsupported.
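  // (Marking them Expand lets the legalizer unroll each v2i16 operation into
  // two scalar i16 operations.)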
  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
                      ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
                      ISD::SINT_TO_FP, ISD::UINT_TO_FP},
                     MVT::v2i16, Expand);

  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);
  setOperationAction(ISD::SUBC, MVT::i32, Legal);
  setOperationAction(ISD::SUBE, MVT::i32, Legal);
  if (STI.getPTXVersion() >= 43) {
    setOperationAction(ISD::ADDC, MVT::i64, Legal);
    setOperationAction(ISD::ADDE, MVT::i64, Legal);
    setOperationAction(ISD::SUBC, MVT::i64, Legal);
    setOperationAction(ISD::SUBE, MVT::i64, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first.
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes.
  setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                       ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
                       ISD::VSELECT});

  // setcc for f16x2 and bf16x2 needs special handling to prevent
  // legalizer's attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math() || STI.hasBF16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have a full implementation. Others
  // have only a token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    // bf16 must be promoted to f32.
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }

  // f16/f16x2 neg was introduced in PTX 60, SM_53.
  const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
                                        STI.getPTXVersion() >= 60 &&
                                        STI.allowFP16Math();
  for (const auto &VT : {MVT::f16, MVT::v2f16})
    setOperationAction(ISD::FNEG, VT,
                       IsFP16FP16x2NegAvailable ? Legal : Expand);

  setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
  setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
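  // (FCEIL/FFLOOR/FTRUNC/FRINT and friends correspond to PTX cvt instructions
  // with the matching rounding modifier, e.g. .rpi/.rmi/.rzi/.rni; bf16 gets
  // the same treatment only where the helper above reports support, otherwise
  // it is promoted to f32.)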
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FROUNDEVEN, ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setOperationAction(Op, MVT::v2bf16, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }

  if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
    setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
  }
  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
    for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
      setOperationAction(ISD::FP_EXTEND, VT, Custom);
      setOperationAction(ISD::FP_ROUND, VT, Custom);
    }
  }

  // sm_80 only has conversions between f32 and bf16. Custom lower all other
  // bf16 conversions.
  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
    for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
      setOperationAction(
          {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
          VT, Custom);
    }
    setOperationAction(
        {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
        MVT::bf16, Custom);
  }

  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);
  setOperationAction(ISD::FROUND, MVT::bf16, Promote);
  AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);

  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);

  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op :
       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setOperationAction(Op, MVT::v2bf16, Expand);
    setOperationAction(Op, MVT::bf16, Promote);
    AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }
  for (const auto &Op : {ISD::FABS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }

  // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
  auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
    bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
    return IsAtLeastSm80 ?
        Legal : NotSm80Action;
  };
  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }
  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
    setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
    setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
    setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
    setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
  }

  // Custom lowering for inline asm with 128-bit operands.
  setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
  setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);

  // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
  // No FPOW or FREM in PTX.

  // Now deduce the information based on the above mentioned
  // actions
  computeRegisterProperties(STI.getRegisterInfo());

  setMinCmpXchgSizeInBits(32);
  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
}

const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {

#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;

  switch ((NVPTXISD::NodeType)Opcode) {
  case NVPTXISD::FIRST_NUMBER:
    break;

    MAKE_CASE(NVPTXISD::CALL)
    MAKE_CASE(NVPTXISD::RET_GLUE)
    MAKE_CASE(NVPTXISD::LOAD_PARAM)
    MAKE_CASE(NVPTXISD::Wrapper)
    MAKE_CASE(NVPTXISD::DeclareParam)
    MAKE_CASE(NVPTXISD::DeclareScalarParam)
    MAKE_CASE(NVPTXISD::DeclareRet)
    MAKE_CASE(NVPTXISD::DeclareScalarRet)
    MAKE_CASE(NVPTXISD::DeclareRetParam)
    MAKE_CASE(NVPTXISD::PrintCall)
    MAKE_CASE(NVPTXISD::PrintConvergentCall)
    MAKE_CASE(NVPTXISD::PrintCallUni)
    MAKE_CASE(NVPTXISD::PrintConvergentCallUni)
    MAKE_CASE(NVPTXISD::LoadParam)
    MAKE_CASE(NVPTXISD::LoadParamV2)
    MAKE_CASE(NVPTXISD::LoadParamV4)
    MAKE_CASE(NVPTXISD::StoreParam)
    MAKE_CASE(NVPTXISD::StoreParamV2)
    MAKE_CASE(NVPTXISD::StoreParamV4)
    MAKE_CASE(NVPTXISD::StoreParamS32)
    MAKE_CASE(NVPTXISD::StoreParamU32)
    MAKE_CASE(NVPTXISD::CallArgBegin)
    MAKE_CASE(NVPTXISD::CallArg)
    MAKE_CASE(NVPTXISD::LastCallArg)
    MAKE_CASE(NVPTXISD::CallArgEnd)
    MAKE_CASE(NVPTXISD::CallVoid)
    MAKE_CASE(NVPTXISD::CallVal)
    MAKE_CASE(NVPTXISD::CallSymbol)
    MAKE_CASE(NVPTXISD::Prototype)
    MAKE_CASE(NVPTXISD::MoveParam)
    MAKE_CASE(NVPTXISD::StoreRetval)
    MAKE_CASE(NVPTXISD::StoreRetvalV2)
    MAKE_CASE(NVPTXISD::StoreRetvalV4)
    MAKE_CASE(NVPTXISD::PseudoUseParam)
    MAKE_CASE(NVPTXISD::RETURN)
    MAKE_CASE(NVPTXISD::CallSeqBegin)
    MAKE_CASE(NVPTXISD::CallSeqEnd)
    MAKE_CASE(NVPTXISD::CallPrototype)
    MAKE_CASE(NVPTXISD::ProxyReg)
    MAKE_CASE(NVPTXISD::LoadV2)
    MAKE_CASE(NVPTXISD::LoadV4)
    MAKE_CASE(NVPTXISD::LDGV2)
    MAKE_CASE(NVPTXISD::LDGV4)
    MAKE_CASE(NVPTXISD::LDUV2)
    MAKE_CASE(NVPTXISD::LDUV4)
    MAKE_CASE(NVPTXISD::StoreV2)
    MAKE_CASE(NVPTXISD::StoreV4)
    MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP)
    MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP)
MAKE_CASE(NVPTXISD::IMAD) 943 MAKE_CASE(NVPTXISD::BFE) 944 MAKE_CASE(NVPTXISD::BFI) 945 MAKE_CASE(NVPTXISD::PRMT) 946 MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC) 947 MAKE_CASE(NVPTXISD::SETP_F16X2) 948 MAKE_CASE(NVPTXISD::SETP_BF16X2) 949 MAKE_CASE(NVPTXISD::Dummy) 950 MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED) 951 MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED) 952 MAKE_CASE(NVPTXISD::Tex1DFloatS32) 953 MAKE_CASE(NVPTXISD::Tex1DFloatFloat) 954 MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel) 955 MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad) 956 MAKE_CASE(NVPTXISD::Tex1DS32S32) 957 MAKE_CASE(NVPTXISD::Tex1DS32Float) 958 MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel) 959 MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad) 960 MAKE_CASE(NVPTXISD::Tex1DU32S32) 961 MAKE_CASE(NVPTXISD::Tex1DU32Float) 962 MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel) 963 MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad) 964 MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32) 965 MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat) 966 MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel) 967 MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad) 968 MAKE_CASE(NVPTXISD::Tex1DArrayS32S32) 969 MAKE_CASE(NVPTXISD::Tex1DArrayS32Float) 970 MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel) 971 MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad) 972 MAKE_CASE(NVPTXISD::Tex1DArrayU32S32) 973 MAKE_CASE(NVPTXISD::Tex1DArrayU32Float) 974 MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel) 975 MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatGrad) 976 MAKE_CASE(NVPTXISD::Tex2DFloatS32) 977 MAKE_CASE(NVPTXISD::Tex2DFloatFloat) 978 MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel) 979 MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad) 980 MAKE_CASE(NVPTXISD::Tex2DS32S32) 981 MAKE_CASE(NVPTXISD::Tex2DS32Float) 982 MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel) 983 MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad) 984 MAKE_CASE(NVPTXISD::Tex2DU32S32) 985 MAKE_CASE(NVPTXISD::Tex2DU32Float) 986 MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel) 987 MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad) 988 MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32) 989 MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat) 990 MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel) 991 MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad) 992 MAKE_CASE(NVPTXISD::Tex2DArrayS32S32) 993 MAKE_CASE(NVPTXISD::Tex2DArrayS32Float) 994 MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel) 995 MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad) 996 MAKE_CASE(NVPTXISD::Tex2DArrayU32S32) 997 MAKE_CASE(NVPTXISD::Tex2DArrayU32Float) 998 MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel) 999 MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad) 1000 MAKE_CASE(NVPTXISD::Tex3DFloatS32) 1001 MAKE_CASE(NVPTXISD::Tex3DFloatFloat) 1002 MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel) 1003 MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad) 1004 MAKE_CASE(NVPTXISD::Tex3DS32S32) 1005 MAKE_CASE(NVPTXISD::Tex3DS32Float) 1006 MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel) 1007 MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad) 1008 MAKE_CASE(NVPTXISD::Tex3DU32S32) 1009 MAKE_CASE(NVPTXISD::Tex3DU32Float) 1010 MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel) 1011 MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad) 1012 MAKE_CASE(NVPTXISD::TexCubeFloatFloat) 1013 MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel) 1014 MAKE_CASE(NVPTXISD::TexCubeS32Float) 1015 MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel) 1016 MAKE_CASE(NVPTXISD::TexCubeU32Float) 1017 MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel) 1018 MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat) 1019 MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel) 1020 MAKE_CASE(NVPTXISD::TexCubeArrayS32Float) 1021 MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel) 1022 MAKE_CASE(NVPTXISD::TexCubeArrayU32Float) 1023 MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel) 1024 
MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat) 1025 MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat) 1026 MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat) 1027 MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat) 1028 MAKE_CASE(NVPTXISD::Tld4R2DS64Float) 1029 MAKE_CASE(NVPTXISD::Tld4G2DS64Float) 1030 MAKE_CASE(NVPTXISD::Tld4B2DS64Float) 1031 MAKE_CASE(NVPTXISD::Tld4A2DS64Float) 1032 MAKE_CASE(NVPTXISD::Tld4R2DU64Float) 1033 MAKE_CASE(NVPTXISD::Tld4G2DU64Float) 1034 MAKE_CASE(NVPTXISD::Tld4B2DU64Float) 1035 MAKE_CASE(NVPTXISD::Tld4A2DU64Float) 1036 1037 MAKE_CASE(NVPTXISD::TexUnified1DFloatS32) 1038 MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat) 1039 MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel) 1040 MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad) 1041 MAKE_CASE(NVPTXISD::TexUnified1DS32S32) 1042 MAKE_CASE(NVPTXISD::TexUnified1DS32Float) 1043 MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel) 1044 MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad) 1045 MAKE_CASE(NVPTXISD::TexUnified1DU32S32) 1046 MAKE_CASE(NVPTXISD::TexUnified1DU32Float) 1047 MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel) 1048 MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad) 1049 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32) 1050 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat) 1051 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel) 1052 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad) 1053 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32) 1054 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float) 1055 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel) 1056 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad) 1057 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32S32) 1058 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float) 1059 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel) 1060 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad) 1061 MAKE_CASE(NVPTXISD::TexUnified2DFloatS32) 1062 MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat) 1063 MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel) 1064 MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad) 1065 MAKE_CASE(NVPTXISD::TexUnified2DS32S32) 1066 MAKE_CASE(NVPTXISD::TexUnified2DS32Float) 1067 MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel) 1068 MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad) 1069 MAKE_CASE(NVPTXISD::TexUnified2DU32S32) 1070 MAKE_CASE(NVPTXISD::TexUnified2DU32Float) 1071 MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel) 1072 MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad) 1073 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32) 1074 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat) 1075 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel) 1076 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad) 1077 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32) 1078 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float) 1079 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel) 1080 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad) 1081 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32) 1082 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float) 1083 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel) 1084 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad) 1085 MAKE_CASE(NVPTXISD::TexUnified3DFloatS32) 1086 MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat) 1087 MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel) 1088 MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad) 1089 MAKE_CASE(NVPTXISD::TexUnified3DS32S32) 1090 MAKE_CASE(NVPTXISD::TexUnified3DS32Float) 1091 MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel) 1092 MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad) 1093 MAKE_CASE(NVPTXISD::TexUnified3DU32S32) 1094 MAKE_CASE(NVPTXISD::TexUnified3DU32Float) 1095 MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel) 1096 
MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad) 1097 MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat) 1098 MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel) 1099 MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float) 1100 MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel) 1101 MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float) 1102 MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel) 1103 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat) 1104 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel) 1105 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float) 1106 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel) 1107 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float) 1108 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel) 1109 MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad) 1110 MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad) 1111 MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad) 1112 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad) 1113 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad) 1114 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad) 1115 MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat) 1116 MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat) 1117 MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat) 1118 MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat) 1119 MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float) 1120 MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float) 1121 MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float) 1122 MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float) 1123 MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float) 1124 MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float) 1125 MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float) 1126 MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float) 1127 1128 MAKE_CASE(NVPTXISD::Suld1DI8Clamp) 1129 MAKE_CASE(NVPTXISD::Suld1DI16Clamp) 1130 MAKE_CASE(NVPTXISD::Suld1DI32Clamp) 1131 MAKE_CASE(NVPTXISD::Suld1DI64Clamp) 1132 MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp) 1133 MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp) 1134 MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp) 1135 MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp) 1136 MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp) 1137 MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp) 1138 MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp) 1139 1140 MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp) 1141 MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp) 1142 MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp) 1143 MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp) 1144 MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp) 1145 MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp) 1146 MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp) 1147 MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp) 1148 MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp) 1149 MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp) 1150 MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp) 1151 1152 MAKE_CASE(NVPTXISD::Suld2DI8Clamp) 1153 MAKE_CASE(NVPTXISD::Suld2DI16Clamp) 1154 MAKE_CASE(NVPTXISD::Suld2DI32Clamp) 1155 MAKE_CASE(NVPTXISD::Suld2DI64Clamp) 1156 MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp) 1157 MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp) 1158 MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp) 1159 MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp) 1160 MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp) 1161 MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp) 1162 MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp) 1163 1164 MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp) 1165 MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp) 1166 MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp) 1167 MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp) 1168 MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp) 1169 MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp) 1170 MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp) 1171 MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp) 1172 MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp) 1173 
MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp) 1174 MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp) 1175 1176 MAKE_CASE(NVPTXISD::Suld3DI8Clamp) 1177 MAKE_CASE(NVPTXISD::Suld3DI16Clamp) 1178 MAKE_CASE(NVPTXISD::Suld3DI32Clamp) 1179 MAKE_CASE(NVPTXISD::Suld3DI64Clamp) 1180 MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp) 1181 MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp) 1182 MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp) 1183 MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp) 1184 MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp) 1185 MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp) 1186 MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp) 1187 1188 MAKE_CASE(NVPTXISD::Suld1DI8Trap) 1189 MAKE_CASE(NVPTXISD::Suld1DI16Trap) 1190 MAKE_CASE(NVPTXISD::Suld1DI32Trap) 1191 MAKE_CASE(NVPTXISD::Suld1DI64Trap) 1192 MAKE_CASE(NVPTXISD::Suld1DV2I8Trap) 1193 MAKE_CASE(NVPTXISD::Suld1DV2I16Trap) 1194 MAKE_CASE(NVPTXISD::Suld1DV2I32Trap) 1195 MAKE_CASE(NVPTXISD::Suld1DV2I64Trap) 1196 MAKE_CASE(NVPTXISD::Suld1DV4I8Trap) 1197 MAKE_CASE(NVPTXISD::Suld1DV4I16Trap) 1198 MAKE_CASE(NVPTXISD::Suld1DV4I32Trap) 1199 1200 MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap) 1201 MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap) 1202 MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap) 1203 MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap) 1204 MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap) 1205 MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap) 1206 MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap) 1207 MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap) 1208 MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap) 1209 MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap) 1210 MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap) 1211 1212 MAKE_CASE(NVPTXISD::Suld2DI8Trap) 1213 MAKE_CASE(NVPTXISD::Suld2DI16Trap) 1214 MAKE_CASE(NVPTXISD::Suld2DI32Trap) 1215 MAKE_CASE(NVPTXISD::Suld2DI64Trap) 1216 MAKE_CASE(NVPTXISD::Suld2DV2I8Trap) 1217 MAKE_CASE(NVPTXISD::Suld2DV2I16Trap) 1218 MAKE_CASE(NVPTXISD::Suld2DV2I32Trap) 1219 MAKE_CASE(NVPTXISD::Suld2DV2I64Trap) 1220 MAKE_CASE(NVPTXISD::Suld2DV4I8Trap) 1221 MAKE_CASE(NVPTXISD::Suld2DV4I16Trap) 1222 MAKE_CASE(NVPTXISD::Suld2DV4I32Trap) 1223 1224 MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap) 1225 MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap) 1226 MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap) 1227 MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap) 1228 MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap) 1229 MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap) 1230 MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap) 1231 MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap) 1232 MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap) 1233 MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap) 1234 MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap) 1235 1236 MAKE_CASE(NVPTXISD::Suld3DI8Trap) 1237 MAKE_CASE(NVPTXISD::Suld3DI16Trap) 1238 MAKE_CASE(NVPTXISD::Suld3DI32Trap) 1239 MAKE_CASE(NVPTXISD::Suld3DI64Trap) 1240 MAKE_CASE(NVPTXISD::Suld3DV2I8Trap) 1241 MAKE_CASE(NVPTXISD::Suld3DV2I16Trap) 1242 MAKE_CASE(NVPTXISD::Suld3DV2I32Trap) 1243 MAKE_CASE(NVPTXISD::Suld3DV2I64Trap) 1244 MAKE_CASE(NVPTXISD::Suld3DV4I8Trap) 1245 MAKE_CASE(NVPTXISD::Suld3DV4I16Trap) 1246 MAKE_CASE(NVPTXISD::Suld3DV4I32Trap) 1247 1248 MAKE_CASE(NVPTXISD::Suld1DI8Zero) 1249 MAKE_CASE(NVPTXISD::Suld1DI16Zero) 1250 MAKE_CASE(NVPTXISD::Suld1DI32Zero) 1251 MAKE_CASE(NVPTXISD::Suld1DI64Zero) 1252 MAKE_CASE(NVPTXISD::Suld1DV2I8Zero) 1253 MAKE_CASE(NVPTXISD::Suld1DV2I16Zero) 1254 MAKE_CASE(NVPTXISD::Suld1DV2I32Zero) 1255 MAKE_CASE(NVPTXISD::Suld1DV2I64Zero) 1256 MAKE_CASE(NVPTXISD::Suld1DV4I8Zero) 1257 MAKE_CASE(NVPTXISD::Suld1DV4I16Zero) 1258 MAKE_CASE(NVPTXISD::Suld1DV4I32Zero) 1259 1260 MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero) 1261 MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero) 1262 MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero) 1263 
MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero) 1264 MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero) 1265 MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero) 1266 MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero) 1267 MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero) 1268 MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero) 1269 MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero) 1270 MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero) 1271 1272 MAKE_CASE(NVPTXISD::Suld2DI8Zero) 1273 MAKE_CASE(NVPTXISD::Suld2DI16Zero) 1274 MAKE_CASE(NVPTXISD::Suld2DI32Zero) 1275 MAKE_CASE(NVPTXISD::Suld2DI64Zero) 1276 MAKE_CASE(NVPTXISD::Suld2DV2I8Zero) 1277 MAKE_CASE(NVPTXISD::Suld2DV2I16Zero) 1278 MAKE_CASE(NVPTXISD::Suld2DV2I32Zero) 1279 MAKE_CASE(NVPTXISD::Suld2DV2I64Zero) 1280 MAKE_CASE(NVPTXISD::Suld2DV4I8Zero) 1281 MAKE_CASE(NVPTXISD::Suld2DV4I16Zero) 1282 MAKE_CASE(NVPTXISD::Suld2DV4I32Zero) 1283 1284 MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero) 1285 MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero) 1286 MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero) 1287 MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero) 1288 MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero) 1289 MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero) 1290 MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero) 1291 MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero) 1292 MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero) 1293 MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero) 1294 MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero) 1295 1296 MAKE_CASE(NVPTXISD::Suld3DI8Zero) 1297 MAKE_CASE(NVPTXISD::Suld3DI16Zero) 1298 MAKE_CASE(NVPTXISD::Suld3DI32Zero) 1299 MAKE_CASE(NVPTXISD::Suld3DI64Zero) 1300 MAKE_CASE(NVPTXISD::Suld3DV2I8Zero) 1301 MAKE_CASE(NVPTXISD::Suld3DV2I16Zero) 1302 MAKE_CASE(NVPTXISD::Suld3DV2I32Zero) 1303 MAKE_CASE(NVPTXISD::Suld3DV2I64Zero) 1304 MAKE_CASE(NVPTXISD::Suld3DV4I8Zero) 1305 MAKE_CASE(NVPTXISD::Suld3DV4I16Zero) 1306 MAKE_CASE(NVPTXISD::Suld3DV4I32Zero) 1307 } 1308 return nullptr; 1309 1310 #undef MAKE_CASE 1311 } 1312 1313 TargetLoweringBase::LegalizeTypeAction 1314 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1315 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 1316 VT.getScalarType() == MVT::i1) 1317 return TypeSplitVector; 1318 if (Isv2x16VT(VT)) 1319 return TypeLegal; 1320 return TargetLoweringBase::getPreferredVectorAction(VT); 1321 } 1322 1323 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1324 int Enabled, int &ExtraSteps, 1325 bool &UseOneConst, 1326 bool Reciprocal) const { 1327 if (!(Enabled == ReciprocalEstimate::Enabled || 1328 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1329 return SDValue(); 1330 1331 if (ExtraSteps == ReciprocalEstimate::Unspecified) 1332 ExtraSteps = 0; 1333 1334 SDLoc DL(Operand); 1335 EVT VT = Operand.getValueType(); 1336 bool Ftz = useF32FTZ(DAG.getMachineFunction()); 1337 1338 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { 1339 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 1340 DAG.getConstant(IID, DL, MVT::i32), Operand); 1341 }; 1342 1343 // The sqrt and rsqrt refinement processes assume we always start out with an 1344 // approximation of the rsqrt. Therefore, if we're going to do any refinement 1345 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing 1346 // any refinement, we must return a regular sqrt. 1347 if (Reciprocal || ExtraSteps > 0) { 1348 if (VT == MVT::f32) 1349 return MakeIntrinsicCall(Ftz ? 
                                     Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
    else
      return SDValue();
  } else {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
    else {
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)).  This is faster than
      // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
      // x * rsqrt(x).)
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, VT,
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
    }
  }
}

SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}

static bool IsTypePassedAsArray(const Type *Ty) {
  return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
         Ty->isHalfTy() || Ty->isBFloatTy();
}

std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
    std::optional<std::pair<unsigned, const APInt &>> VAInfo,
    const CallBase &CB, unsigned UniqueCallSite) const {
  auto PtrVT = getPointerTy(DL);

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return "";

  std::string Prototype;
  raw_string_ostream O(Prototype);
  O << "prototype_" << UniqueCallSite << " : .callprototype ";

  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
        !IsTypePassedAsArray(retTy)) {
      unsigned size = 0;
      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }
      // PTX ABI requires all scalar return values to be at least 32
      // bits in size.  fp16 normally uses .b16 as its storage type in
      // PTX, so its size must be adjusted here, too.
      size = promoteScalarArgumentSize(size);

      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << PtrVT.getSizeInBits() << " _";
    } else if (IsTypePassedAsArray(retTy)) {
      O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
        << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
    } else {
      llvm_unreachable("Unknown return type");
    }
    O << ") ";
  }
  O << "_ (";

  bool first = true;

  unsigned NumArgs = VAInfo ?
VAInfo->first : Args.size(); 1437 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) { 1438 Type *Ty = Args[i].Ty; 1439 if (!first) { 1440 O << ", "; 1441 } 1442 first = false; 1443 1444 if (!Outs[OIdx].Flags.isByVal()) { 1445 if (IsTypePassedAsArray(Ty)) { 1446 Align ParamAlign = 1447 getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL); 1448 O << ".param .align " << ParamAlign.value() << " .b8 "; 1449 O << "_"; 1450 O << "[" << DL.getTypeAllocSize(Ty) << "]"; 1451 // update the index for Outs 1452 SmallVector<EVT, 16> vtparts; 1453 ComputeValueVTs(*this, DL, Ty, vtparts); 1454 if (unsigned len = vtparts.size()) 1455 OIdx += len - 1; 1456 continue; 1457 } 1458 // i8 types in IR will be i16 types in SDAG 1459 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1460 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1461 "type mismatch between callee prototype and arguments"); 1462 // scalar type 1463 unsigned sz = 0; 1464 if (isa<IntegerType>(Ty)) { 1465 sz = cast<IntegerType>(Ty)->getBitWidth(); 1466 sz = promoteScalarArgumentSize(sz); 1467 } else if (isa<PointerType>(Ty)) { 1468 sz = PtrVT.getSizeInBits(); 1469 } else { 1470 sz = Ty->getPrimitiveSizeInBits(); 1471 } 1472 O << ".param .b" << sz << " "; 1473 O << "_"; 1474 continue; 1475 } 1476 1477 // Indirect calls need strict ABI alignment so we disable optimizations by 1478 // not providing a function to optimize. 1479 Type *ETy = Args[i].IndirectType; 1480 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1481 Align ParamByValAlign = 1482 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL); 1483 1484 O << ".param .align " << ParamByValAlign.value() << " .b8 "; 1485 O << "_"; 1486 O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; 1487 } 1488 1489 if (VAInfo) 1490 O << (first ? "" : ",") << " .param .align " << VAInfo->second 1491 << " .b8 _[]\n"; 1492 O << ")"; 1493 if (shouldEmitPTXNoReturn(&CB, *nvTM)) 1494 O << " .noreturn"; 1495 O << ";"; 1496 1497 return Prototype; 1498 } 1499 1500 Align NVPTXTargetLowering::getFunctionArgumentAlignment( 1501 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const { 1502 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL)); 1503 } 1504 1505 Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, 1506 unsigned Idx, 1507 const DataLayout &DL) const { 1508 if (!CB) { 1509 // CallSite is zero, fallback to ABI type alignment 1510 return DL.getABITypeAlign(Ty); 1511 } 1512 1513 const Function *DirectCallee = CB->getCalledFunction(); 1514 1515 if (!DirectCallee) { 1516 // We don't have a direct function symbol, but that may be because of 1517 // constant cast instructions in the call. 
1518
1519 // With bitcast'd call targets, the instruction will be the call
1520 if (const auto *CI = dyn_cast<CallInst>(CB)) {
1521 // Check if we have call alignment metadata
1522 if (MaybeAlign StackAlign = getAlign(*CI, Idx))
1523 return StackAlign.value();
1524 }
1525 DirectCallee = getMaybeBitcastedCallee(CB);
1526 }
1527
1528 // Check for function alignment information if we found that the
1529 // ultimate target is a Function
1530 if (DirectCallee)
1531 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);
1532
1533 // Call is indirect, fall back to the ABI type alignment
1534 return DL.getABITypeAlign(Ty);
1535 }
1536
1537 static bool adjustElementType(EVT &ElementType) {
1538 switch (ElementType.getSimpleVT().SimpleTy) {
1539 default:
1540 return false;
1541 case MVT::f16:
1542 case MVT::bf16:
1543 ElementType = MVT::i16;
1544 return true;
1545 case MVT::f32:
1546 case MVT::v2f16:
1547 case MVT::v2bf16:
1548 ElementType = MVT::i32;
1549 return true;
1550 case MVT::f64:
1551 ElementType = MVT::i64;
1552 return true;
1553 }
1554 }
1555
1556 // Use byte-store when the param address of the argument value is unaligned.
1557 // This may happen when the return value is a field of a packed structure.
1558 //
1559 // This is called in LowerCall() when passing the param values.
1560 static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
1561 uint64_t Offset, EVT ElementType,
1562 SDValue StVal, SDValue &InGlue,
1563 unsigned ArgID, const SDLoc &dl) {
1564 // Bit logic only works on integer types
1565 if (adjustElementType(ElementType))
1566 StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);
1567
1568 // Store each byte
1569 SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1570 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1571 // Shift the byte to the last byte position
1572 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
1573 DAG.getConstant(i * 8, dl, MVT::i32));
1574 SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
1575 DAG.getConstant(Offset + i, dl, MVT::i32),
1576 ShiftVal, InGlue};
1577 // Trunc store only the last byte by using
1578 // st.param.b8
1579 // The register type can be larger than b8.
1580 Chain = DAG.getMemIntrinsicNode(
1581 NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
1582 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
1583 InGlue = Chain.getValue(1);
1584 }
1585 return Chain;
1586 }
1587
1588 // Use byte-load when the param address of the returned value is unaligned.
1589 // This may happen when the returned value is a field of a packed structure.
1590 static SDValue
1591 LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
1592 EVT ElementType, SDValue &InGlue,
1593 SmallVectorImpl<SDValue> &TempProxyRegOps,
1594 const SDLoc &dl) {
1595 // Bit logic only works on integer types
1596 EVT MergedType = ElementType;
1597 adjustElementType(MergedType);
1598
1599 // Load each byte and construct the whole value.
Initial value to 0 1600 SDValue RetVal = DAG.getConstant(0, dl, MergedType); 1601 // LoadParamMemI8 loads into i16 register only 1602 SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); 1603 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { 1604 SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), 1605 DAG.getConstant(Offset + i, dl, MVT::i32), 1606 InGlue}; 1607 // This will be selected to LoadParamMemI8 1608 SDValue LdVal = 1609 DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, 1610 MVT::i8, MachinePointerInfo(), Align(1)); 1611 SDValue TmpLdVal = LdVal.getValue(0); 1612 Chain = LdVal.getValue(1); 1613 InGlue = LdVal.getValue(2); 1614 1615 TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, 1616 TmpLdVal.getSimpleValueType(), TmpLdVal); 1617 TempProxyRegOps.push_back(TmpLdVal); 1618 1619 SDValue CMask = DAG.getConstant(255, dl, MergedType); 1620 SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); 1621 // Need to extend the i16 register to the whole width. 1622 TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); 1623 // Mask off the high bits. Leave only the lower 8bits. 1624 // Do this because we are using loadparam.b8. 1625 TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); 1626 // Shift and merge 1627 TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); 1628 RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); 1629 } 1630 if (ElementType != MergedType) 1631 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); 1632 1633 return RetVal; 1634 } 1635 1636 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1637 SmallVectorImpl<SDValue> &InVals) const { 1638 1639 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30)) 1640 report_fatal_error( 1641 "Support for variadic functions (unsized array parameter) introduced " 1642 "in PTX ISA version 6.0 and requires target sm_30."); 1643 1644 SelectionDAG &DAG = CLI.DAG; 1645 SDLoc dl = CLI.DL; 1646 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1647 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1648 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1649 SDValue Chain = CLI.Chain; 1650 SDValue Callee = CLI.Callee; 1651 bool &isTailCall = CLI.IsTailCall; 1652 ArgListTy &Args = CLI.getArgs(); 1653 Type *RetTy = CLI.RetTy; 1654 const CallBase *CB = CLI.CB; 1655 const DataLayout &DL = DAG.getDataLayout(); 1656 1657 bool isABI = (STI.getSmVersion() >= 20); 1658 assert(isABI && "Non-ABI compilation is not supported"); 1659 if (!isABI) 1660 return Chain; 1661 1662 // Variadic arguments. 1663 // 1664 // Normally, for each argument, we declare a param scalar or a param 1665 // byte array in the .param space, and store the argument value to that 1666 // param scalar or array starting at offset 0. 1667 // 1668 // In the case of the first variadic argument, we declare a vararg byte array 1669 // with size 0. The exact size of this array isn't known at this point, so 1670 // it'll be patched later. All the variadic arguments will be stored to this 1671 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is 1672 // initially set to 0, so it can be used for non-variadic arguments (which use 1673 // 0 offset) to simplify the code. 1674 // 1675 // After all vararg is processed, 'VAOffset' holds the size of the 1676 // vararg byte array. 
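// As an illustrative sketch only (not taken from actual compiler output), a
// call with one fixed i32 argument followed by two variadic i32 values might
// declare:
//   .param .b32 param0;                    // the fixed argument
//   .param .align <maxalign> .b8 param1[8]; // vararg byte array; align comes
//                                           // from getMaxRequiredAlignment(),
//                                           // size ends up equal to VAOffset
// and the two variadic values are then stored into param1 at offsets 0 and 4.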
1677 1678 SDValue VADeclareParam; // vararg byte array 1679 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic 1680 unsigned VAOffset = 0; // current offset in the param array 1681 1682 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); 1683 SDValue TempChain = Chain; 1684 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); 1685 SDValue InGlue = Chain.getValue(1); 1686 1687 unsigned ParamCount = 0; 1688 // Args.size() and Outs.size() need not match. 1689 // Outs.size() will be larger 1690 // * if there is an aggregate argument with multiple fields (each field 1691 // showing up separately in Outs) 1692 // * if there is a vector argument with more than typical vector-length 1693 // elements (generally if more than 4) where each vector element is 1694 // individually present in Outs. 1695 // So a different index should be used for indexing into Outs/OutVals. 1696 // See similar issue in LowerFormalArguments. 1697 unsigned OIdx = 0; 1698 // Declare the .params or .reg need to pass values 1699 // to the function 1700 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1701 EVT VT = Outs[OIdx].VT; 1702 Type *Ty = Args[i].Ty; 1703 bool IsVAArg = (i >= CLI.NumFixedArgs); 1704 bool IsByVal = Outs[OIdx].Flags.isByVal(); 1705 1706 SmallVector<EVT, 16> VTs; 1707 SmallVector<uint64_t, 16> Offsets; 1708 1709 assert((!IsByVal || Args[i].IndirectType) && 1710 "byval arg must have indirect type"); 1711 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty); 1712 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset); 1713 1714 Align ArgAlign; 1715 if (IsByVal) { 1716 // The ByValAlign in the Outs[OIdx].Flags is always set at this point, 1717 // so we don't need to worry whether it's naturally aligned or not. 1718 // See TargetLowering::LowerCallTo(). 1719 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1720 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy, 1721 InitialAlign, DL); 1722 if (IsVAArg) 1723 VAOffset = alignTo(VAOffset, ArgAlign); 1724 } else { 1725 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL); 1726 } 1727 1728 unsigned TypeSize = 1729 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); 1730 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1731 1732 bool NeedAlign; // Does argument declaration specify alignment? 1733 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty); 1734 if (IsVAArg) { 1735 if (ParamCount == FirstVAArg) { 1736 SDValue DeclareParamOps[] = { 1737 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32), 1738 DAG.getConstant(ParamCount, dl, MVT::i32), 1739 DAG.getConstant(1, dl, MVT::i32), InGlue}; 1740 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, 1741 DeclareParamVTs, DeclareParamOps); 1742 } 1743 NeedAlign = PassAsArray; 1744 } else if (PassAsArray) { 1745 // declare .param .align <align> .b8 .param<n>[<size>]; 1746 SDValue DeclareParamOps[] = { 1747 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1748 DAG.getConstant(ParamCount, dl, MVT::i32), 1749 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue}; 1750 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1751 DeclareParamOps); 1752 NeedAlign = true; 1753 } else { 1754 // declare .param .b<size> .param<n>; 1755 if (VT.isInteger() || VT.isFloatingPoint()) { 1756 // PTX ABI requires integral types to be at least 32 bits in 1757 // size. FP16 is loaded/stored using i16, so it's handled 1758 // here as well. 
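// For example, an i8 or i16 scalar argument ends up declared as .param .b32
// (illustrative; the widening itself is done by promoteScalarArgumentSize()
// just below).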
1759 TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
1760 }
1761 SDValue DeclareScalarParamOps[] = {
1762 Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1763 DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1764 DAG.getConstant(0, dl, MVT::i32), InGlue};
1765 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1766 DeclareScalarParamOps);
1767 NeedAlign = false;
1768 }
1769 InGlue = Chain.getValue(1);
1770
1771 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1772 // than 32-bits are sign extended or zero extended, depending on
1773 // whether they are signed or unsigned types. This case applies
1774 // only to scalar parameters and not to aggregate values.
1775 bool ExtendIntegerParam =
1776 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1777
1778 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1779 SmallVector<SDValue, 6> StoreOperands;
1780 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1781 EVT EltVT = VTs[j];
1782 int CurOffset = Offsets[j];
1783 MaybeAlign PartAlign;
1784 if (NeedAlign)
1785 PartAlign = commonAlignment(ArgAlign, CurOffset);
1786
1787 SDValue StVal = OutVals[OIdx];
1788
1789 MVT PromotedVT;
1790 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1791 EltVT = EVT(PromotedVT);
1792 }
1793 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1794 llvm::ISD::NodeType Ext =
1795 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1796 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1797 }
1798
1799 if (IsByVal) {
1800 auto PtrVT = getPointerTy(DL);
1801 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1802 DAG.getConstant(CurOffset, dl, PtrVT));
1803 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(),
1804 PartAlign);
1805 } else if (ExtendIntegerParam) {
1806 assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1807 // zext/sext to i32
1808 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1809 : ISD::ZERO_EXTEND,
1810 dl, MVT::i32, StVal);
1811 }
1812
1813 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1814 // Use 16-bit registers for small stores as it's the
1815 // smallest general purpose register size supported by NVPTX.
1816 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1817 }
1818
1819 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1820 // scalar store. In such cases, fall back to byte stores.
1821 if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1822 PartAlign.value() <
1823 DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
1824 assert(StoreOperands.empty() && "Unfinished preceding store.");
1825 Chain = LowerUnalignedStoreParam(
1826 DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1827 StVal, InGlue, ParamCount, dl);
1828
1829 // LowerUnalignedStoreParam took care of inserting the necessary nodes
1830 // into the SDAG, so just move on to the next element.
1831 if (!IsByVal)
1832 ++OIdx;
1833 continue;
1834 }
1835
1836 // New store.
1837 if (VectorInfo[j] & PVF_FIRST) {
1838 assert(StoreOperands.empty() && "Unfinished preceding store.");
1839 StoreOperands.push_back(Chain);
1840 StoreOperands.push_back(
1841 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1842
1843 StoreOperands.push_back(DAG.getConstant(
1844 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1845 dl, MVT::i32));
1846 }
1847
1848 // Record the value to store.
1849 StoreOperands.push_back(StVal); 1850 1851 if (VectorInfo[j] & PVF_LAST) { 1852 unsigned NumElts = StoreOperands.size() - 3; 1853 NVPTXISD::NodeType Op; 1854 switch (NumElts) { 1855 case 1: 1856 Op = NVPTXISD::StoreParam; 1857 break; 1858 case 2: 1859 Op = NVPTXISD::StoreParamV2; 1860 break; 1861 case 4: 1862 Op = NVPTXISD::StoreParamV4; 1863 break; 1864 default: 1865 llvm_unreachable("Invalid vector info."); 1866 } 1867 1868 StoreOperands.push_back(InGlue); 1869 1870 // Adjust type of the store op if we've extended the scalar 1871 // return value. 1872 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; 1873 1874 Chain = DAG.getMemIntrinsicNode( 1875 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1876 TheStoreType, MachinePointerInfo(), PartAlign, 1877 MachineMemOperand::MOStore); 1878 InGlue = Chain.getValue(1); 1879 1880 // Cleanup. 1881 StoreOperands.clear(); 1882 1883 // TODO: We may need to support vector types that can be passed 1884 // as scalars in variadic arguments. 1885 if (!IsByVal && IsVAArg) { 1886 assert(NumElts == 1 && 1887 "Vectorization is expected to be disabled for variadics."); 1888 VAOffset += DL.getTypeAllocSize( 1889 TheStoreType.getTypeForEVT(*DAG.getContext())); 1890 } 1891 } 1892 if (!IsByVal) 1893 ++OIdx; 1894 } 1895 assert(StoreOperands.empty() && "Unfinished parameter store."); 1896 if (!IsByVal && VTs.size() > 0) 1897 --OIdx; 1898 ++ParamCount; 1899 if (IsByVal && IsVAArg) 1900 VAOffset += TypeSize; 1901 } 1902 1903 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); 1904 MaybeAlign retAlignment = std::nullopt; 1905 1906 // Handle Result 1907 if (Ins.size() > 0) { 1908 SmallVector<EVT, 16> resvtparts; 1909 ComputeValueVTs(*this, DL, RetTy, resvtparts); 1910 1911 // Declare 1912 // .param .align N .b8 retval0[<size-in-bytes>], or 1913 // .param .b<size-in-bits> retval0 1914 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); 1915 if (!IsTypePassedAsArray(RetTy)) { 1916 resultsz = promoteScalarArgumentSize(resultsz); 1917 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1918 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1919 DAG.getConstant(resultsz, dl, MVT::i32), 1920 DAG.getConstant(0, dl, MVT::i32), InGlue }; 1921 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, 1922 DeclareRetOps); 1923 InGlue = Chain.getValue(1); 1924 } else { 1925 retAlignment = getArgumentAlignment(CB, RetTy, 0, DL); 1926 assert(retAlignment && "retAlignment is guaranteed to be set"); 1927 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1928 SDValue DeclareRetOps[] = { 1929 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), 1930 DAG.getConstant(resultsz / 8, dl, MVT::i32), 1931 DAG.getConstant(0, dl, MVT::i32), InGlue}; 1932 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, 1933 DeclareRetOps); 1934 InGlue = Chain.getValue(1); 1935 } 1936 } 1937 1938 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); 1939 // Set the size of the vararg param byte array if the callee is a variadic 1940 // function and the variadic part is not empty. 
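// Note that VADeclareParam was created above with a placeholder size operand
// of 1; morphing the node here rewrites that operand to the final VAOffset.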
1941 if (HasVAArgs) {
1942 SDValue DeclareParamOps[] = {
1943 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1944 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1945 VADeclareParam.getOperand(4)};
1946 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1947 VADeclareParam->getVTList(), DeclareParamOps);
1948 }
1949
1950 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1951 // between them we must rely on the call site value which is valid for
1952 // indirect calls but is always null for libcalls.
1953 bool isIndirectCall = !Func && CB;
1954
1955 if (isa<ExternalSymbolSDNode>(Callee)) {
1956 Function* CalleeFunc = nullptr;
1957
1958 // Try to find the callee in the current module.
1959 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1960 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1961
1962 // Set the "libcall callee" attribute to indicate that the function
1963 // must always have a declaration.
1964 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1965 }
1966
1967 if (isIndirectCall) {
1968 // This is the indirect function call case: PTX requires a prototype of the
1969 // form
1970 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1971 // to be emitted, and the label has to be used as the last arg of the call
1972 // instruction.
1973 // The prototype is embedded in a string and put as the operand for a
1974 // CallPrototype SDNode which will print out to the value of the string.
1975 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1976 std::string Proto = getPrototype(
1977 DL, RetTy, Args, Outs, retAlignment,
1978 HasVAArgs
1979 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1980 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
1981 : std::nullopt,
1982 *CB, UniqueCallSite);
1983 const char *ProtoStr = nvTM->getStrPool().save(Proto).data();
1984 SDValue ProtoOps[] = {
1985 Chain,
1986 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1987 InGlue,
1988 };
1989 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1990 InGlue = Chain.getValue(1);
1991 }
1992 // Op to just print "call"
1993 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1994 SDValue PrintCallOps[] = {
1995 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1996 };
1997 // We model convergent calls as separate opcodes.
1998 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1999 if (CLI.IsConvergent)
2000 Opcode = Opcode == NVPTXISD::PrintCallUni ?
NVPTXISD::PrintConvergentCallUni
2001 : NVPTXISD::PrintConvergentCall;
2002 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
2003 InGlue = Chain.getValue(1);
2004
2005 // Ops to print out the function name
2006 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2007 SDValue CallVoidOps[] = { Chain, Callee, InGlue };
2008 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
2009 InGlue = Chain.getValue(1);
2010
2011 // Ops to print out the param list
2012 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2013 SDValue CallArgBeginOps[] = { Chain, InGlue };
2014 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
2015 CallArgBeginOps);
2016 InGlue = Chain.getValue(1);
2017
2018 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
2019 ++i) {
2020 unsigned opcode;
2021 if (i == (e - 1))
2022 opcode = NVPTXISD::LastCallArg;
2023 else
2024 opcode = NVPTXISD::CallArg;
2025 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2026 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2027 DAG.getConstant(i, dl, MVT::i32), InGlue };
2028 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2029 InGlue = Chain.getValue(1);
2030 }
2031 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2032 SDValue CallArgEndOps[] = { Chain,
2033 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2034 InGlue };
2035 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2036 InGlue = Chain.getValue(1);
2037
2038 if (isIndirectCall) {
2039 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2040 SDValue PrototypeOps[] = {
2041 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2042 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2043 InGlue = Chain.getValue(1);
2044 }
2045
2046 SmallVector<SDValue, 16> ProxyRegOps;
2047 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2048 // An item of the vector is filled if the element does not need a ProxyReg
2049 // operation on it and should be added to InVals as is. ProxyRegOps and
2050 // ProxyRegTruncates contain empty/none items at the same index.
2051 SmallVector<SDValue, 16> RetElts;
2052 // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
2053 // to use the values of `LoadParam`s; they are replaced later, once
2054 // `CALLSEQ_END` is added.
2055 SmallVector<SDValue, 16> TempProxyRegOps;
2056
2057 // Generate loads from param memory/moves from registers for result
2058 if (Ins.size() > 0) {
2059 SmallVector<EVT, 16> VTs;
2060 SmallVector<uint64_t, 16> Offsets;
2061 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
2062 assert(VTs.size() == Ins.size() && "Bad value decomposition");
2063
2064 Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
2065 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
2066
2067 SmallVector<EVT, 6> LoadVTs;
2068 int VecIdx = -1; // Index of the first element of the vector.
2069
2070 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2071 // 32-bits are sign extended or zero extended, depending on whether
2072 // they are signed or unsigned types.
2073 bool ExtendIntegerRetVal = 2074 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2075 2076 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2077 bool needTruncate = false; 2078 EVT TheLoadType = VTs[i]; 2079 EVT EltType = Ins[i].VT; 2080 Align EltAlign = commonAlignment(RetAlign, Offsets[i]); 2081 MVT PromotedVT; 2082 2083 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) { 2084 TheLoadType = EVT(PromotedVT); 2085 EltType = EVT(PromotedVT); 2086 needTruncate = true; 2087 } 2088 2089 if (ExtendIntegerRetVal) { 2090 TheLoadType = MVT::i32; 2091 EltType = MVT::i32; 2092 needTruncate = true; 2093 } else if (TheLoadType.getSizeInBits() < 16) { 2094 if (VTs[i].isInteger()) 2095 needTruncate = true; 2096 EltType = MVT::i16; 2097 } 2098 2099 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a 2100 // scalar load. In such cases, fall back to byte loads. 2101 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() && 2102 EltAlign < DL.getABITypeAlign( 2103 TheLoadType.getTypeForEVT(*DAG.getContext()))) { 2104 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 2105 SDValue Ret = LowerUnalignedLoadRetParam( 2106 DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl); 2107 ProxyRegOps.push_back(SDValue()); 2108 ProxyRegTruncates.push_back(std::optional<MVT>()); 2109 RetElts.resize(i); 2110 RetElts.push_back(Ret); 2111 2112 continue; 2113 } 2114 2115 // Record index of the very first element of the vector. 2116 if (VectorInfo[i] & PVF_FIRST) { 2117 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 2118 VecIdx = i; 2119 } 2120 2121 LoadVTs.push_back(EltType); 2122 2123 if (VectorInfo[i] & PVF_LAST) { 2124 unsigned NumElts = LoadVTs.size(); 2125 LoadVTs.push_back(MVT::Other); 2126 LoadVTs.push_back(MVT::Glue); 2127 NVPTXISD::NodeType Op; 2128 switch (NumElts) { 2129 case 1: 2130 Op = NVPTXISD::LoadParam; 2131 break; 2132 case 2: 2133 Op = NVPTXISD::LoadParamV2; 2134 break; 2135 case 4: 2136 Op = NVPTXISD::LoadParamV4; 2137 break; 2138 default: 2139 llvm_unreachable("Invalid vector info."); 2140 } 2141 2142 SDValue LoadOperands[] = { 2143 Chain, DAG.getConstant(1, dl, MVT::i32), 2144 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue}; 2145 SDValue RetVal = DAG.getMemIntrinsicNode( 2146 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 2147 MachinePointerInfo(), EltAlign, 2148 MachineMemOperand::MOLoad); 2149 2150 for (unsigned j = 0; j < NumElts; ++j) { 2151 ProxyRegOps.push_back(RetVal.getValue(j)); 2152 2153 if (needTruncate) 2154 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT)); 2155 else 2156 ProxyRegTruncates.push_back(std::optional<MVT>()); 2157 } 2158 2159 Chain = RetVal.getValue(NumElts); 2160 InGlue = RetVal.getValue(NumElts + 1); 2161 2162 // Cleanup 2163 VecIdx = -1; 2164 LoadVTs.clear(); 2165 } 2166 } 2167 } 2168 2169 Chain = 2170 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); 2171 InGlue = Chain.getValue(1); 2172 2173 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 2174 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 2175 // dangling. 
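// (Roughly speaking, a ProxyReg node simply forwards its value operand; it is
// kept on the chain/glue here so the result loads above stay tied to the call
// sequence.)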
2176 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 2177 if (i < RetElts.size() && RetElts[i]) { 2178 InVals.push_back(RetElts[i]); 2179 continue; 2180 } 2181 2182 SDValue Ret = DAG.getNode( 2183 NVPTXISD::ProxyReg, dl, 2184 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 2185 { Chain, ProxyRegOps[i], InGlue } 2186 ); 2187 2188 Chain = Ret.getValue(1); 2189 InGlue = Ret.getValue(2); 2190 2191 if (ProxyRegTruncates[i]) { 2192 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret); 2193 } 2194 2195 InVals.push_back(Ret); 2196 } 2197 2198 for (SDValue &T : TempProxyRegOps) { 2199 SDValue Repl = DAG.getNode( 2200 NVPTXISD::ProxyReg, dl, 2201 DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue), 2202 {Chain, T.getOperand(0), InGlue}); 2203 DAG.ReplaceAllUsesWith(T, Repl); 2204 DAG.RemoveDeadNode(T.getNode()); 2205 2206 Chain = Repl.getValue(1); 2207 InGlue = Repl.getValue(2); 2208 } 2209 2210 // set isTailCall to false for now, until we figure out how to express 2211 // tail call optimization in PTX 2212 isTailCall = false; 2213 return Chain; 2214 } 2215 2216 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 2217 SelectionDAG &DAG) const { 2218 2219 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) { 2220 const Function &Fn = DAG.getMachineFunction().getFunction(); 2221 2222 DiagnosticInfoUnsupported NoDynamicAlloca( 2223 Fn, 2224 "Support for dynamic alloca introduced in PTX ISA version 7.3 and " 2225 "requires target sm_52.", 2226 SDLoc(Op).getDebugLoc()); 2227 DAG.getContext()->diagnose(NoDynamicAlloca); 2228 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), 2229 Op.getOperand(0)}; 2230 return DAG.getMergeValues(Ops, SDLoc()); 2231 } 2232 2233 SDValue Chain = Op.getOperand(0); 2234 SDValue Size = Op.getOperand(1); 2235 uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 2236 SDLoc DL(Op.getNode()); 2237 2238 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32. 2239 if (nvTM->is64Bit()) 2240 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64); 2241 else 2242 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32); 2243 2244 SDValue AllocOps[] = {Chain, Size, 2245 DAG.getTargetConstant(Align, DL, MVT::i32)}; 2246 SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, 2247 nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps); 2248 2249 SDValue MergeOps[] = {Alloca, Chain}; 2250 return DAG.getMergeValues(MergeOps, DL); 2251 } 2252 2253 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 2254 // (see LegalizeDAG.cpp). This is slow and uses local memory. 2255 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 2256 SDValue 2257 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 2258 SDNode *Node = Op.getNode(); 2259 SDLoc dl(Node); 2260 SmallVector<SDValue, 8> Ops; 2261 unsigned NumOperands = Node->getNumOperands(); 2262 for (unsigned i = 0; i < NumOperands; ++i) { 2263 SDValue SubOp = Node->getOperand(i); 2264 EVT VVT = SubOp.getNode()->getValueType(0); 2265 EVT EltVT = VVT.getVectorElementType(); 2266 unsigned NumSubElem = VVT.getVectorNumElements(); 2267 for (unsigned j = 0; j < NumSubElem; ++j) { 2268 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 2269 DAG.getIntPtrConstant(j, dl))); 2270 } 2271 } 2272 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 2273 } 2274 2275 // We can init constant f16x2/v2i16/v4i8 with a single .b32 move. 
Normally it 2276 // would get lowered as two constant loads and vector-packing move. 2277 // Instead we want just a constant move: 2278 // mov.b32 %r2, 0x40003C00 2279 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 2280 SelectionDAG &DAG) const { 2281 EVT VT = Op->getValueType(0); 2282 if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) 2283 return Op; 2284 2285 SDLoc DL(Op); 2286 2287 if (!llvm::all_of(Op->ops(), [](SDValue Operand) { 2288 return Operand->isUndef() || isa<ConstantSDNode>(Operand) || 2289 isa<ConstantFPSDNode>(Operand); 2290 })) { 2291 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us 2292 // to optimize calculation of constant parts. 2293 if (VT == MVT::v4i8) { 2294 SDValue C8 = DAG.getConstant(8, DL, MVT::i32); 2295 SDValue E01 = DAG.getNode( 2296 NVPTXISD::BFI, DL, MVT::i32, 2297 DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), 2298 DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); 2299 SDValue E012 = 2300 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2301 DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), 2302 E01, DAG.getConstant(16, DL, MVT::i32), C8); 2303 SDValue E0123 = 2304 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2305 DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), 2306 E012, DAG.getConstant(24, DL, MVT::i32), C8); 2307 return DAG.getNode(ISD::BITCAST, DL, VT, E0123); 2308 } 2309 return Op; 2310 } 2311 2312 // Get value or the Nth operand as an APInt(32). Undef values treated as 0. 2313 auto GetOperand = [](SDValue Op, int N) -> APInt { 2314 const SDValue &Operand = Op->getOperand(N); 2315 EVT VT = Op->getValueType(0); 2316 if (Operand->isUndef()) 2317 return APInt(32, 0); 2318 APInt Value; 2319 if (VT == MVT::v2f16 || VT == MVT::v2bf16) 2320 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt(); 2321 else if (VT == MVT::v2i16 || VT == MVT::v4i8) 2322 Value = Operand->getAsAPIntVal(); 2323 else 2324 llvm_unreachable("Unsupported type"); 2325 // i8 values are carried around as i16, so we need to zero out upper bits, 2326 // so they do not get in the way of combining individual byte values 2327 if (VT == MVT::v4i8) 2328 Value = Value.trunc(8); 2329 return Value.zext(32); 2330 }; 2331 APInt Value; 2332 if (Isv2x16VT(VT)) { 2333 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16); 2334 } else if (VT == MVT::v4i8) { 2335 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) | 2336 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24); 2337 } else { 2338 llvm_unreachable("Unsupported type"); 2339 } 2340 SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32); 2341 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const); 2342 } 2343 2344 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 2345 SelectionDAG &DAG) const { 2346 SDValue Index = Op->getOperand(1); 2347 SDValue Vector = Op->getOperand(0); 2348 SDLoc DL(Op); 2349 EVT VectorVT = Vector.getValueType(); 2350 2351 if (VectorVT == MVT::v4i8) { 2352 SDValue BFE = 2353 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32, 2354 {Vector, 2355 DAG.getNode(ISD::MUL, DL, MVT::i32, 2356 DAG.getZExtOrTrunc(Index, DL, MVT::i32), 2357 DAG.getConstant(8, DL, MVT::i32)), 2358 DAG.getConstant(8, DL, MVT::i32)}); 2359 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0)); 2360 } 2361 2362 // Constant index will be matched by tablegen. 2363 if (isa<ConstantSDNode>(Index.getNode())) 2364 return Op; 2365 2366 // Extract individual elements and select one of them. 
2367 assert(Isv2x16VT(VectorVT) && "Unexpected vector type."); 2368 EVT EltVT = VectorVT.getVectorElementType(); 2369 2370 SDLoc dl(Op.getNode()); 2371 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2372 DAG.getIntPtrConstant(0, dl)); 2373 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2374 DAG.getIntPtrConstant(1, dl)); 2375 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 2376 ISD::CondCode::SETEQ); 2377 } 2378 2379 SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 2380 SelectionDAG &DAG) const { 2381 SDValue Vector = Op->getOperand(0); 2382 EVT VectorVT = Vector.getValueType(); 2383 2384 if (VectorVT != MVT::v4i8) 2385 return Op; 2386 SDLoc DL(Op); 2387 SDValue Value = Op->getOperand(1); 2388 if (Value->isUndef()) 2389 return Vector; 2390 2391 SDValue Index = Op->getOperand(2); 2392 2393 SDValue BFI = 2394 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2395 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector, 2396 DAG.getNode(ISD::MUL, DL, MVT::i32, 2397 DAG.getZExtOrTrunc(Index, DL, MVT::i32), 2398 DAG.getConstant(8, DL, MVT::i32)), 2399 DAG.getConstant(8, DL, MVT::i32)}); 2400 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI); 2401 } 2402 2403 SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 2404 SelectionDAG &DAG) const { 2405 SDValue V1 = Op.getOperand(0); 2406 EVT VectorVT = V1.getValueType(); 2407 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8) 2408 return Op; 2409 2410 // Lower shuffle to PRMT instruction. 2411 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 2412 SDValue V2 = Op.getOperand(1); 2413 uint32_t Selector = 0; 2414 for (auto I : llvm::enumerate(SVN->getMask())) { 2415 if (I.value() != -1) // -1 is a placeholder for undef. 2416 Selector |= (I.value() << (I.index() * 4)); 2417 } 2418 2419 SDLoc DL(Op); 2420 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, 2421 DAG.getConstant(Selector, DL, MVT::i32), 2422 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); 2423 } 2424 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 2425 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2426 /// amount, or 2427 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2428 /// amount. 2429 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 2430 SelectionDAG &DAG) const { 2431 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2432 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 2433 2434 EVT VT = Op.getValueType(); 2435 unsigned VTBits = VT.getSizeInBits(); 2436 SDLoc dl(Op); 2437 SDValue ShOpLo = Op.getOperand(0); 2438 SDValue ShOpHi = Op.getOperand(1); 2439 SDValue ShAmt = Op.getOperand(2); 2440 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 2441 2442 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2443 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2444 // {dHi, dLo} = {aHi, aLo} >> Amt 2445 // dHi = aHi >> Amt 2446 // dLo = shf.r.clamp aLo, aHi, Amt 2447 2448 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2449 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 2450 ShAmt); 2451 2452 SDValue Ops[2] = { Lo, Hi }; 2453 return DAG.getMergeValues(Ops, dl); 2454 } 2455 else { 2456 // {dHi, dLo} = {aHi, aLo} >> Amt 2457 // - if (Amt>=size) then 2458 // dLo = aHi >> (Amt-size) 2459 // dHi = aHi >> Amt (this is either all 0 or all 1) 2460 // else 2461 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 2462 // dHi = aHi >> Amt 2463 2464 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2465 DAG.getConstant(VTBits, dl, MVT::i32), 2466 ShAmt); 2467 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 2468 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2469 DAG.getConstant(VTBits, dl, MVT::i32)); 2470 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 2471 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2472 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2473 2474 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2475 DAG.getConstant(VTBits, dl, MVT::i32), 2476 ISD::SETGE); 2477 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2478 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2479 2480 SDValue Ops[2] = { Lo, Hi }; 2481 return DAG.getMergeValues(Ops, dl); 2482 } 2483 } 2484 2485 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2486 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2487 /// amount, or 2488 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2489 /// amount. 2490 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2491 SelectionDAG &DAG) const { 2492 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2493 assert(Op.getOpcode() == ISD::SHL_PARTS); 2494 2495 EVT VT = Op.getValueType(); 2496 unsigned VTBits = VT.getSizeInBits(); 2497 SDLoc dl(Op); 2498 SDValue ShOpLo = Op.getOperand(0); 2499 SDValue ShOpHi = Op.getOperand(1); 2500 SDValue ShAmt = Op.getOperand(2); 2501 2502 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2503 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2504 // {dHi, dLo} = {aHi, aLo} << Amt
2505 // dHi = shf.l.clamp aLo, aHi, Amt
2506 // dLo = aLo << Amt
2507
2508 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2509 ShAmt);
2510 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2511
2512 SDValue Ops[2] = { Lo, Hi };
2513 return DAG.getMergeValues(Ops, dl);
2514 }
2515 else {
2516 // {dHi, dLo} = {aHi, aLo} << Amt
2517 // - if (Amt>=size) then
2518 // dLo = aLo << Amt (all 0)
2519 // dHi = aLo << (Amt-size)
2520 // else
2521 // dLo = aLo << Amt
2522 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2523
2524 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2525 DAG.getConstant(VTBits, dl, MVT::i32),
2526 ShAmt);
2527 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2528 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2529 DAG.getConstant(VTBits, dl, MVT::i32));
2530 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2531 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2532 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2533
2534 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2535 DAG.getConstant(VTBits, dl, MVT::i32),
2536 ISD::SETGE);
2537 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2538 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2539
2540 SDValue Ops[2] = { Lo, Hi };
2541 return DAG.getMergeValues(Ops, dl);
2542 }
2543 }
2544
2545 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2546 EVT VT = Op.getValueType();
2547
2548 if (VT == MVT::f32)
2549 return LowerFROUND32(Op, DAG);
2550
2551 if (VT == MVT::f64)
2552 return LowerFROUND64(Op, DAG);
2553
2554 llvm_unreachable("unhandled type");
2555 }
2556
2557 // This is the rounding method used in CUDA libdevice, in C-like code:
2558 // float roundf(float A)
2559 // {
2560 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2561 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2562 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2563 // }
2564 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2565 SelectionDAG &DAG) const {
2566 SDLoc SL(Op);
2567 SDValue A = Op.getOperand(0);
2568 EVT VT = Op.getValueType();
2569
2570 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2571
2572 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2573 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2574 const int SignBitMask = 0x80000000;
2575 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2576 DAG.getConstant(SignBitMask, SL, MVT::i32));
2577 const int PointFiveInBits = 0x3F000000;
2578 SDValue PointFiveWithSignRaw =
2579 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2580 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2581 SDValue PointFiveWithSign =
2582 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2583 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2584 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2585
2586 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2587 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2588 SDValue IsLarge =
2589 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2590 ISD::SETOGT);
2591 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2592
2593 // return abs(A) < 0.5 ?
(float)(int)A : RoundedA; 2594 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2595 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2596 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2597 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2598 } 2599 2600 // The implementation of round(double) is similar to that of round(float) in 2601 // that they both separate the value range into three regions and use a method 2602 // specific to the region to round the values. However, round(double) first 2603 // calculates the round of the absolute value and then adds the sign back while 2604 // round(float) directly rounds the value with sign. 2605 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2606 SelectionDAG &DAG) const { 2607 SDLoc SL(Op); 2608 SDValue A = Op.getOperand(0); 2609 EVT VT = Op.getValueType(); 2610 2611 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2612 2613 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2614 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2615 DAG.getConstantFP(0.5, SL, VT)); 2616 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2617 2618 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2619 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2620 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2621 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2622 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2623 DAG.getConstantFP(0, SL, VT), 2624 RoundedA); 2625 2626 // Add sign to rounded_A 2627 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2628 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2629 2630 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2631 SDValue IsLarge = 2632 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2633 ISD::SETOGT); 2634 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2635 } 2636 2637 SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op, 2638 SelectionDAG &DAG) const { 2639 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); 2640 2641 if (Op.getValueType() == MVT::bf16) { 2642 SDLoc Loc(Op); 2643 return DAG.getNode( 2644 ISD::FP_ROUND, Loc, MVT::bf16, 2645 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)), 2646 DAG.getIntPtrConstant(0, Loc)); 2647 } 2648 2649 // Everything else is considered legal. 2650 return Op; 2651 } 2652 2653 SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op, 2654 SelectionDAG &DAG) const { 2655 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); 2656 2657 if (Op.getOperand(0).getValueType() == MVT::bf16) { 2658 SDLoc Loc(Op); 2659 return DAG.getNode( 2660 Op.getOpcode(), Loc, Op.getValueType(), 2661 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0))); 2662 } 2663 2664 // Everything else is considered legal. 2665 return Op; 2666 } 2667 2668 SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op, 2669 SelectionDAG &DAG) const { 2670 EVT NarrowVT = Op.getValueType(); 2671 SDValue Wide = Op.getOperand(0); 2672 EVT WideVT = Wide.getValueType(); 2673 if (NarrowVT.getScalarType() == MVT::bf16) { 2674 const TargetLowering *TLI = STI.getTargetLowering(); 2675 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) { 2676 return TLI->expandFP_ROUND(Op.getNode(), DAG); 2677 } 2678 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { 2679 // This combination was the first to support f32 -> bf16. 
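// (That is, sm_80 with PTX 7.0, which is exactly what the check below tests.)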
2680 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) { 2681 if (WideVT.getScalarType() == MVT::f32) { 2682 return Op; 2683 } 2684 if (WideVT.getScalarType() == MVT::f64) { 2685 SDLoc Loc(Op); 2686 // Round-inexact-to-odd f64 to f32, then do the final rounding using 2687 // the hardware f32 -> bf16 instruction. 2688 SDValue rod = TLI->expandRoundInexactToOdd( 2689 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32) 2690 : MVT::f32, 2691 Wide, Loc, DAG); 2692 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT); 2693 } 2694 } 2695 return TLI->expandFP_ROUND(Op.getNode(), DAG); 2696 } 2697 } 2698 2699 // Everything else is considered legal. 2700 return Op; 2701 } 2702 2703 SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op, 2704 SelectionDAG &DAG) const { 2705 SDValue Narrow = Op.getOperand(0); 2706 EVT NarrowVT = Narrow.getValueType(); 2707 EVT WideVT = Op.getValueType(); 2708 if (NarrowVT.getScalarType() == MVT::bf16) { 2709 if (WideVT.getScalarType() == MVT::f32 && 2710 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) { 2711 SDLoc Loc(Op); 2712 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow); 2713 } 2714 if (WideVT.getScalarType() == MVT::f64 && 2715 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) { 2716 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32) 2717 : MVT::f32; 2718 SDLoc Loc(Op); 2719 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) { 2720 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow); 2721 } else { 2722 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow); 2723 } 2724 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op); 2725 } 2726 } 2727 2728 // Everything else is considered legal. 2729 return Op; 2730 } 2731 2732 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) { 2733 SDLoc DL(Op); 2734 if (Op.getValueType() != MVT::v2i16) 2735 return Op; 2736 EVT EltVT = Op.getValueType().getVectorElementType(); 2737 SmallVector<SDValue> VecElements; 2738 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) { 2739 SmallVector<SDValue> ScalarArgs; 2740 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs), 2741 [&](const SDUse &O) { 2742 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, 2743 O.get(), DAG.getIntPtrConstant(I, DL)); 2744 }); 2745 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs)); 2746 } 2747 SDValue V = 2748 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements); 2749 return V; 2750 } 2751 2752 SDValue 2753 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2754 switch (Op.getOpcode()) { 2755 case ISD::RETURNADDR: 2756 return SDValue(); 2757 case ISD::FRAMEADDR: 2758 return SDValue(); 2759 case ISD::GlobalAddress: 2760 return LowerGlobalAddress(Op, DAG); 2761 case ISD::INTRINSIC_W_CHAIN: 2762 return Op; 2763 case ISD::BUILD_VECTOR: 2764 return LowerBUILD_VECTOR(Op, DAG); 2765 case ISD::EXTRACT_SUBVECTOR: 2766 return Op; 2767 case ISD::EXTRACT_VECTOR_ELT: 2768 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2769 case ISD::INSERT_VECTOR_ELT: 2770 return LowerINSERT_VECTOR_ELT(Op, DAG); 2771 case ISD::VECTOR_SHUFFLE: 2772 return LowerVECTOR_SHUFFLE(Op, DAG); 2773 case ISD::CONCAT_VECTORS: 2774 return LowerCONCAT_VECTORS(Op, DAG); 2775 case ISD::STORE: 2776 return LowerSTORE(Op, DAG); 2777 case ISD::LOAD: 2778 return LowerLOAD(Op, DAG); 2779 case ISD::SHL_PARTS: 2780 return LowerShiftLeftParts(Op, DAG); 2781 case ISD::SRA_PARTS: 2782 case ISD::SRL_PARTS: 2783 return LowerShiftRightParts(Op, DAG); 2784 case 
ISD::SELECT: 2785 return LowerSelect(Op, DAG); 2786 case ISD::FROUND: 2787 return LowerFROUND(Op, DAG); 2788 case ISD::SINT_TO_FP: 2789 case ISD::UINT_TO_FP: 2790 return LowerINT_TO_FP(Op, DAG); 2791 case ISD::FP_TO_SINT: 2792 case ISD::FP_TO_UINT: 2793 return LowerFP_TO_INT(Op, DAG); 2794 case ISD::FP_ROUND: 2795 return LowerFP_ROUND(Op, DAG); 2796 case ISD::FP_EXTEND: 2797 return LowerFP_EXTEND(Op, DAG); 2798 case ISD::VAARG: 2799 return LowerVAARG(Op, DAG); 2800 case ISD::VASTART: 2801 return LowerVASTART(Op, DAG); 2802 case ISD::ABS: 2803 case ISD::SMIN: 2804 case ISD::SMAX: 2805 case ISD::UMIN: 2806 case ISD::UMAX: 2807 case ISD::ADD: 2808 case ISD::SUB: 2809 case ISD::MUL: 2810 case ISD::SHL: 2811 case ISD::SREM: 2812 case ISD::UREM: 2813 return LowerVectorArith(Op, DAG); 2814 case ISD::DYNAMIC_STACKALLOC: 2815 return LowerDYNAMIC_STACKALLOC(Op, DAG); 2816 case ISD::CopyToReg: 2817 return LowerCopyToReg_128(Op, DAG); 2818 default: 2819 llvm_unreachable("Custom lowering not defined for operation"); 2820 } 2821 } 2822 2823 // This function is almost a copy of SelectionDAG::expandVAArg(). 2824 // The only diff is that this one produces loads from local address space. 2825 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 2826 const TargetLowering *TLI = STI.getTargetLowering(); 2827 SDLoc DL(Op); 2828 2829 SDNode *Node = Op.getNode(); 2830 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); 2831 EVT VT = Node->getValueType(0); 2832 auto *Ty = VT.getTypeForEVT(*DAG.getContext()); 2833 SDValue Tmp1 = Node->getOperand(0); 2834 SDValue Tmp2 = Node->getOperand(1); 2835 const MaybeAlign MA(Node->getConstantOperandVal(3)); 2836 2837 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL, 2838 Tmp1, Tmp2, MachinePointerInfo(V)); 2839 SDValue VAList = VAListLoad; 2840 2841 if (MA && *MA > TLI->getMinStackArgumentAlignment()) { 2842 VAList = DAG.getNode( 2843 ISD::ADD, DL, VAList.getValueType(), VAList, 2844 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType())); 2845 2846 VAList = DAG.getNode( 2847 ISD::AND, DL, VAList.getValueType(), VAList, 2848 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType())); 2849 } 2850 2851 // Increment the pointer, VAList, to the next vaarg 2852 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, 2853 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty), 2854 DL, VAList.getValueType())); 2855 2856 // Store the incremented VAList to the legalized pointer 2857 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2, 2858 MachinePointerInfo(V)); 2859 2860 const Value *SrcV = 2861 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL)); 2862 2863 // Load the actual argument out of the pointer VAList 2864 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV)); 2865 } 2866 2867 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2868 const TargetLowering *TLI = STI.getTargetLowering(); 2869 SDLoc DL(Op); 2870 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout()); 2871 2872 // Store the address of unsized array <function>_vararg[] in the ap object. 
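// (For a function named foo this would be the symbol foo_vararg; an
// illustrative instance of the <function>_vararg naming described above.)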
2873 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT); 2874 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg); 2875 2876 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2877 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1), 2878 MachinePointerInfo(SV)); 2879 } 2880 2881 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2882 SDValue Op0 = Op->getOperand(0); 2883 SDValue Op1 = Op->getOperand(1); 2884 SDValue Op2 = Op->getOperand(2); 2885 SDLoc DL(Op.getNode()); 2886 2887 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2888 2889 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2890 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2891 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 2892 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2893 2894 return Trunc; 2895 } 2896 2897 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2898 if (Op.getValueType() == MVT::i1) 2899 return LowerLOADi1(Op, DAG); 2900 2901 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle 2902 // unaligned loads and have to handle it here. 2903 EVT VT = Op.getValueType(); 2904 if (Isv2x16VT(VT) || VT == MVT::v4i8) { 2905 LoadSDNode *Load = cast<LoadSDNode>(Op); 2906 EVT MemVT = Load->getMemoryVT(); 2907 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2908 MemVT, *Load->getMemOperand())) { 2909 SDValue Ops[2]; 2910 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2911 return DAG.getMergeValues(Ops, SDLoc(Op)); 2912 } 2913 } 2914 2915 return SDValue(); 2916 } 2917 2918 // v = ld i1* addr 2919 // => 2920 // v1 = ld i8* addr (-> i16) 2921 // v = trunc i16 to i1 2922 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2923 SDNode *Node = Op.getNode(); 2924 LoadSDNode *LD = cast<LoadSDNode>(Node); 2925 SDLoc dl(Node); 2926 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2927 assert(Node->getValueType(0) == MVT::i1 && 2928 "Custom lowering for i1 load only"); 2929 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(), 2930 LD->getBasePtr(), LD->getPointerInfo(), 2931 MVT::i8, LD->getAlign(), 2932 LD->getMemOperand()->getFlags()); 2933 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2934 // The legalizer (the caller) is expecting two values from the legalized 2935 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2936 // in LegalizeDAG.cpp which also uses MergeValues. 2937 SDValue Ops[] = { result, LD->getChain() }; 2938 return DAG.getMergeValues(Ops, dl); 2939 } 2940 2941 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2942 StoreSDNode *Store = cast<StoreSDNode>(Op); 2943 EVT VT = Store->getMemoryVT(); 2944 2945 if (VT == MVT::i1) 2946 return LowerSTOREi1(Op, DAG); 2947 2948 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2949 // stores and have to handle it here. 2950 if ((Isv2x16VT(VT) || VT == MVT::v4i8) && 2951 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2952 VT, *Store->getMemOperand())) 2953 return expandUnalignedStore(Store, DAG); 2954 2955 // v2f16, v2bf16 and v2i16 don't need special handling. 
2956 if (Isv2x16VT(VT) || VT == MVT::v4i8) 2957 return SDValue(); 2958 2959 if (VT.isVector()) 2960 return LowerSTOREVector(Op, DAG); 2961 2962 return SDValue(); 2963 } 2964 2965 SDValue 2966 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2967 SDNode *N = Op.getNode(); 2968 SDValue Val = N->getOperand(1); 2969 SDLoc DL(N); 2970 EVT ValVT = Val.getValueType(); 2971 2972 if (ValVT.isVector()) { 2973 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2974 // legal. We can (and should) split that into 2 stores of <2 x double> here 2975 // but I'm leaving that as a TODO for now. 2976 if (!ValVT.isSimple()) 2977 return SDValue(); 2978 switch (ValVT.getSimpleVT().SimpleTy) { 2979 default: 2980 return SDValue(); 2981 case MVT::v2i8: 2982 case MVT::v2i16: 2983 case MVT::v2i32: 2984 case MVT::v2i64: 2985 case MVT::v2f16: 2986 case MVT::v2bf16: 2987 case MVT::v2f32: 2988 case MVT::v2f64: 2989 case MVT::v4i8: 2990 case MVT::v4i16: 2991 case MVT::v4i32: 2992 case MVT::v4f16: 2993 case MVT::v4bf16: 2994 case MVT::v4f32: 2995 case MVT::v8f16: // <4 x f16x2> 2996 case MVT::v8bf16: // <4 x bf16x2> 2997 case MVT::v8i16: // <4 x i16x2> 2998 // This is a "native" vector type 2999 break; 3000 } 3001 3002 MemSDNode *MemSD = cast<MemSDNode>(N); 3003 const DataLayout &TD = DAG.getDataLayout(); 3004 3005 Align Alignment = MemSD->getAlign(); 3006 Align PrefAlign = 3007 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); 3008 if (Alignment < PrefAlign) { 3009 // This store is not sufficiently aligned, so bail out and let this vector 3010 // store be scalarized. Note that we may still be able to emit smaller 3011 // vector stores. For example, if we are storing a <4 x float> with an 3012 // alignment of 8, this check will fail but the legalizer will try again 3013 // with 2 x <2 x float>, which will succeed with an alignment of 8. 3014 return SDValue(); 3015 } 3016 3017 unsigned Opcode = 0; 3018 EVT EltVT = ValVT.getVectorElementType(); 3019 unsigned NumElts = ValVT.getVectorNumElements(); 3020 3021 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 3022 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 3023 // stored type to i16 and propagate the "real" type as the memory type. 3024 bool NeedExt = false; 3025 if (EltVT.getSizeInBits() < 16) 3026 NeedExt = true; 3027 3028 bool StoreF16x2 = false; 3029 switch (NumElts) { 3030 default: 3031 return SDValue(); 3032 case 2: 3033 Opcode = NVPTXISD::StoreV2; 3034 break; 3035 case 4: 3036 Opcode = NVPTXISD::StoreV4; 3037 break; 3038 case 8: 3039 // v8f16 is a special case. PTX doesn't have st.v8.f16 3040 // instruction. Instead, we split the vector into v2f16 chunks and 3041 // store them with st.v4.b32. 
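// For example, a store of <8 x f16> {a,b,c,d,e,f,g,h} becomes a single StoreV4
// whose operands are the packed pairs {a,b}, {c,d}, {e,f}, {g,h}.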
3042 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector."); 3043 Opcode = NVPTXISD::StoreV4; 3044 StoreF16x2 = true; 3045 break; 3046 } 3047 3048 SmallVector<SDValue, 8> Ops; 3049 3050 // First is the chain 3051 Ops.push_back(N->getOperand(0)); 3052 3053 if (StoreF16x2) { 3054 // Combine f16,f16 -> v2f16 3055 NumElts /= 2; 3056 for (unsigned i = 0; i < NumElts; ++i) { 3057 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 3058 DAG.getIntPtrConstant(i * 2, DL)); 3059 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 3060 DAG.getIntPtrConstant(i * 2 + 1, DL)); 3061 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2); 3062 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1); 3063 Ops.push_back(V2); 3064 } 3065 } else { 3066 // Then the split values 3067 for (unsigned i = 0; i < NumElts; ++i) { 3068 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 3069 DAG.getIntPtrConstant(i, DL)); 3070 if (NeedExt) 3071 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 3072 Ops.push_back(ExtVal); 3073 } 3074 } 3075 3076 // Then any remaining arguments 3077 Ops.append(N->op_begin() + 2, N->op_end()); 3078 3079 SDValue NewSt = 3080 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 3081 MemSD->getMemoryVT(), MemSD->getMemOperand()); 3082 3083 // return DCI.CombineTo(N, NewSt, true); 3084 return NewSt; 3085 } 3086 3087 return SDValue(); 3088 } 3089 3090 // st i1 v, addr 3091 // => 3092 // v1 = zxt v to i16 3093 // st.u8 i16, addr 3094 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 3095 SDNode *Node = Op.getNode(); 3096 SDLoc dl(Node); 3097 StoreSDNode *ST = cast<StoreSDNode>(Node); 3098 SDValue Tmp1 = ST->getChain(); 3099 SDValue Tmp2 = ST->getBasePtr(); 3100 SDValue Tmp3 = ST->getValue(); 3101 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 3102 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 3103 SDValue Result = 3104 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 3105 ST->getAlign(), ST->getMemOperand()->getFlags()); 3106 return Result; 3107 } 3108 3109 SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op, 3110 SelectionDAG &DAG) const { 3111 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit 3112 // operand so that it can pass the legalization. 
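// That is, (CopyToReg Chain, Reg, i128:Val [, Glue]) is rewritten as
// (CopyToReg Chain, Reg, i64:Lo, i64:Hi [, Glue]), where Lo and Hi are
// extracted from a v2i64 bitcast of the original 128-bit value.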
3113 3114 assert(Op.getOperand(1).getValueType() == MVT::i128 && 3115 "Custom lowering for 128-bit CopyToReg only"); 3116 3117 SDNode *Node = Op.getNode(); 3118 SDLoc DL(Node); 3119 3120 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2)); 3121 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, 3122 DAG.getIntPtrConstant(0, DL)); 3123 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, 3124 DAG.getIntPtrConstant(1, DL)); 3125 3126 SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1); 3127 SmallVector<EVT, 3> ResultsType(Node->values()); 3128 3129 NewOps[0] = Op->getOperand(0); // Chain 3130 NewOps[1] = Op->getOperand(1); // Dst Reg 3131 NewOps[2] = Lo; // Lower 64-bit 3132 NewOps[3] = Hi; // Higher 64-bit 3133 if (Op.getNumOperands() == 4) 3134 NewOps[4] = Op->getOperand(3); // Glue if exists 3135 3136 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps); 3137 } 3138 3139 unsigned NVPTXTargetLowering::getNumRegisters( 3140 LLVMContext &Context, EVT VT, 3141 std::optional<MVT> RegisterVT = std::nullopt) const { 3142 if (VT == MVT::i128 && RegisterVT == MVT::i128) 3143 return 1; 3144 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT); 3145 } 3146 3147 bool NVPTXTargetLowering::splitValueIntoRegisterParts( 3148 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, 3149 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const { 3150 if (Val.getValueType() == MVT::i128 && NumParts == 1) { 3151 Parts[0] = Val; 3152 return true; 3153 } 3154 return false; 3155 } 3156 3157 // This creates target external symbol for a function parameter. 3158 // Name of the symbol is composed from its index and the function name. 3159 // Negative index corresponds to special parameter (unsized array) used for 3160 // passing variable arguments. 3161 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, 3162 EVT v) const { 3163 StringRef SavedStr = nvTM->getStrPool().save( 3164 getParamName(&DAG.getMachineFunction().getFunction(), idx)); 3165 return DAG.getTargetExternalSymbol(SavedStr.data(), v); 3166 } 3167 3168 SDValue NVPTXTargetLowering::LowerFormalArguments( 3169 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3170 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3171 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3172 MachineFunction &MF = DAG.getMachineFunction(); 3173 const DataLayout &DL = DAG.getDataLayout(); 3174 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3175 3176 const Function *F = &MF.getFunction(); 3177 const AttributeList &PAL = F->getAttributes(); 3178 const TargetLowering *TLI = STI.getTargetLowering(); 3179 3180 SDValue Root = DAG.getRoot(); 3181 std::vector<SDValue> OutChains; 3182 3183 bool isABI = (STI.getSmVersion() >= 20); 3184 assert(isABI && "Non-ABI compilation is not supported"); 3185 if (!isABI) 3186 return Chain; 3187 3188 std::vector<Type *> argTypes; 3189 std::vector<const Argument *> theArgs; 3190 for (const Argument &I : F->args()) { 3191 theArgs.push_back(&I); 3192 argTypes.push_back(I.getType()); 3193 } 3194 // argTypes.size() (or theArgs.size()) and Ins.size() need not match. 3195 // Ins.size() will be larger 3196 // * if there is an aggregate argument with multiple fields (each field 3197 // showing up separately in Ins) 3198 // * if there is a vector argument with more than typical vector-length 3199 // elements (generally if more than 4) where each vector element is 3200 // individually present in Ins. 
3201 // So a different index should be used for indexing into Ins. 3202 // See similar issue in LowerCall. 3203 unsigned InsIdx = 0; 3204 3205 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) { 3206 Type *Ty = argTypes[i]; 3207 3208 if (theArgs[i]->use_empty()) { 3209 // argument is dead 3210 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) { 3211 SmallVector<EVT, 16> vtparts; 3212 3213 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); 3214 if (vtparts.empty()) 3215 report_fatal_error("Empty parameter types are not supported"); 3216 3217 for (unsigned parti = 0, parte = vtparts.size(); parti != parte; 3218 ++parti) { 3219 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3220 ++InsIdx; 3221 } 3222 if (vtparts.size() > 0) 3223 --InsIdx; 3224 continue; 3225 } 3226 if (Ty->isVectorTy()) { 3227 EVT ObjectVT = getValueType(DL, Ty); 3228 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); 3229 for (unsigned parti = 0; parti < NumRegs; ++parti) { 3230 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3231 ++InsIdx; 3232 } 3233 if (NumRegs > 0) 3234 --InsIdx; 3235 continue; 3236 } 3237 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3238 continue; 3239 } 3240 3241 // In the following cases, assign a node order of "i+1" 3242 // to newly created nodes. The SDNodes for params have to 3243 // appear in the same order as their order of appearance 3244 // in the original function. "i+1" holds that order. 3245 if (!PAL.hasParamAttr(i, Attribute::ByVal)) { 3246 bool aggregateIsPacked = false; 3247 if (StructType *STy = dyn_cast<StructType>(Ty)) 3248 aggregateIsPacked = STy->isPacked(); 3249 3250 SmallVector<EVT, 16> VTs; 3251 SmallVector<uint64_t, 16> Offsets; 3252 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 3253 if (VTs.empty()) 3254 report_fatal_error("Empty parameter types are not supported"); 3255 3256 Align ArgAlign = getFunctionArgumentAlignment( 3257 F, Ty, i + AttributeList::FirstArgIndex, DL); 3258 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); 3259 3260 SDValue Arg = getParamSymbol(DAG, i, PtrVT); 3261 int VecIdx = -1; // Index of the first element of the current vector. 3262 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 3263 if (VectorInfo[parti] & PVF_FIRST) { 3264 assert(VecIdx == -1 && "Orphaned vector."); 3265 VecIdx = parti; 3266 } 3267 3268 // That's the last element of this store op. 3269 if (VectorInfo[parti] & PVF_LAST) { 3270 unsigned NumElts = parti - VecIdx + 1; 3271 EVT EltVT = VTs[parti]; 3272 // i1 is loaded/stored as i8. 3273 EVT LoadVT = EltVT; 3274 if (EltVT == MVT::i1) 3275 LoadVT = MVT::i8; 3276 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) 3277 // getLoad needs a vector type, but it can't handle 3278 // vectors which contain v2f16 or v2bf16 elements. So we must load 3279 // using i32 here and then bitcast back. 
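// (Each element extracted from the i32 load below is bitcast back to its
// original packed type before being pushed into InVals.)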
3280             LoadVT = MVT::i32;
3281
3282           EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
3283           SDValue VecAddr =
3284               DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
3285                           DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
3286           Value *srcValue = Constant::getNullValue(PointerType::get(
3287               EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
3288
3289           const MaybeAlign PartAlign = [&]() -> MaybeAlign {
3290             if (aggregateIsPacked)
3291               return Align(1);
3292             if (NumElts != 1)
3293               return std::nullopt;
3294             Align PartAlign =
3295                 DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
3296             return commonAlignment(PartAlign, Offsets[parti]);
3297           }();
3298           SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
3299                                   MachinePointerInfo(srcValue), PartAlign,
3300                                   MachineMemOperand::MODereferenceable |
3301                                       MachineMemOperand::MOInvariant);
3302           if (P.getNode())
3303             P.getNode()->setIROrder(i + 1);
3304           for (unsigned j = 0; j < NumElts; ++j) {
3305             SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
3306                                       DAG.getIntPtrConstant(j, dl));
3307             // We've loaded i1 as an i8 and now must truncate it back to i1
3308             if (EltVT == MVT::i1)
3309               Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3310             // v2f16 was loaded as an i32. Now we must bitcast it back.
3311             else if (EltVT != LoadVT)
3312               Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);
3313
3314             // If a promoted integer type is used, truncate down to the original type.
3315             MVT PromotedVT;
3316             if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
3317               Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
3318             }
3319
3320             // Extend the element if necessary (e.g. an i8 is loaded
3321             // into an i16 register)
3322             if (Ins[InsIdx].VT.isInteger() &&
3323                 Ins[InsIdx].VT.getFixedSizeInBits() >
3324                     LoadVT.getFixedSizeInBits()) {
3325               unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3326                                                            : ISD::ZERO_EXTEND;
3327               Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3328             }
3329             InVals.push_back(Elt);
3330           }
3331
3332           // Reset vector tracking state.
3333           VecIdx = -1;
3334         }
3335         ++InsIdx;
3336       }
3337       if (VTs.size() > 0)
3338         --InsIdx;
3339       continue;
3340     }
3341
3342     // Param has the ByVal attribute:
3343     // return MoveParam(param symbol).
3344     // Ideally, the param symbol could be returned directly,
3345     // but when the SDNode builder decides to use it in a CopyToReg(),
3346     // the machine instruction fails because a TargetExternalSymbol
3347     // (not lowered) is target dependent, and CopyToReg assumes
3348     // the source is lowered.
3349     EVT ObjectVT = getValueType(DL, Ty);
3350     assert(ObjectVT == Ins[InsIdx].VT &&
3351            "Ins type did not match function type");
3352     SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3353     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
3354     if (p.getNode())
3355       p.getNode()->setIROrder(i + 1);
3356     InVals.push_back(p);
3357   }
3358
3359   if (!OutChains.empty())
3360     DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3361
3362   return Chain;
3363 }
3364
3365 // Use byte-store when the param address of the return value is unaligned.
3366 // This may happen when the return value is a field of a packed structure.
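// For example, an i32 return value placed at byte offset 1 of a packed struct
// is written back with four single-byte StoreRetval (st.param.b8) operations
// at offsets 1..4, each storing one shifted-down byte of the value.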
3367 static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, 3368 uint64_t Offset, EVT ElementType, 3369 SDValue RetVal, const SDLoc &dl) { 3370 // Bit logic only works on integer types 3371 if (adjustElementType(ElementType)) 3372 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); 3373 3374 // Store each byte 3375 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { 3376 // Shift the byte to the last byte position 3377 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal, 3378 DAG.getConstant(i * 8, dl, MVT::i32)); 3379 SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32), 3380 ShiftVal}; 3381 // Trunc store only the last byte by using 3382 // st.param.b8 3383 // The register type can be larger than b8. 3384 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, 3385 DAG.getVTList(MVT::Other), StoreOperands, 3386 MVT::i8, MachinePointerInfo(), std::nullopt, 3387 MachineMemOperand::MOStore); 3388 } 3389 return Chain; 3390 } 3391 3392 SDValue 3393 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3394 bool isVarArg, 3395 const SmallVectorImpl<ISD::OutputArg> &Outs, 3396 const SmallVectorImpl<SDValue> &OutVals, 3397 const SDLoc &dl, SelectionDAG &DAG) const { 3398 const MachineFunction &MF = DAG.getMachineFunction(); 3399 const Function &F = MF.getFunction(); 3400 Type *RetTy = MF.getFunction().getReturnType(); 3401 3402 bool isABI = (STI.getSmVersion() >= 20); 3403 assert(isABI && "Non-ABI compilation is not supported"); 3404 if (!isABI) 3405 return Chain; 3406 3407 const DataLayout &DL = DAG.getDataLayout(); 3408 SmallVector<SDValue, 16> PromotedOutVals; 3409 SmallVector<EVT, 16> VTs; 3410 SmallVector<uint64_t, 16> Offsets; 3411 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 3412 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 3413 3414 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 3415 SDValue PromotedOutVal = OutVals[i]; 3416 MVT PromotedVT; 3417 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) { 3418 VTs[i] = EVT(PromotedVT); 3419 } 3420 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) { 3421 llvm::ISD::NodeType Ext = 3422 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3423 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal); 3424 } 3425 PromotedOutVals.push_back(PromotedOutVal); 3426 } 3427 3428 auto VectorInfo = VectorizePTXValueVTs( 3429 VTs, Offsets, 3430 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) 3431 : Align(1)); 3432 3433 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 3434 // 32-bits are sign extended or zero extended, depending on whether 3435 // they are signed or unsigned types. 3436 bool ExtendIntegerRetVal = 3437 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 3438 3439 SmallVector<SDValue, 6> StoreOperands; 3440 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 3441 SDValue OutVal = OutVals[i]; 3442 SDValue RetVal = PromotedOutVals[i]; 3443 3444 if (ExtendIntegerRetVal) { 3445 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 3446 : ISD::ZERO_EXTEND, 3447 dl, MVT::i32, RetVal); 3448 } else if (OutVal.getValueSizeInBits() < 16) { 3449 // Use 16-bit registers for small load-stores as it's the 3450 // smallest general purpose register size supported by NVPTX. 
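// e.g. an i1 or i8 element of an aggregate return value is widened to i16 here
// so that it can be kept in a 16-bit register before being stored.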
3451 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 3452 } 3453 3454 // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned 3455 // for a scalar store. In such cases, fall back to byte stores. 3456 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) { 3457 EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 3458 Align ElementTypeAlign = 3459 DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext())); 3460 Align ElementAlign = 3461 commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]); 3462 if (ElementAlign < ElementTypeAlign) { 3463 assert(StoreOperands.empty() && "Orphaned operand list."); 3464 Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType, 3465 RetVal, dl); 3466 3467 // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes 3468 // into the graph, so just move on to the next element. 3469 continue; 3470 } 3471 } 3472 3473 // New load/store. Record chain and offset operands. 3474 if (VectorInfo[i] & PVF_FIRST) { 3475 assert(StoreOperands.empty() && "Orphaned operand list."); 3476 StoreOperands.push_back(Chain); 3477 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 3478 } 3479 3480 // Record the value to return. 3481 StoreOperands.push_back(RetVal); 3482 3483 // That's the last element of this store op. 3484 if (VectorInfo[i] & PVF_LAST) { 3485 NVPTXISD::NodeType Op; 3486 unsigned NumElts = StoreOperands.size() - 2; 3487 switch (NumElts) { 3488 case 1: 3489 Op = NVPTXISD::StoreRetval; 3490 break; 3491 case 2: 3492 Op = NVPTXISD::StoreRetvalV2; 3493 break; 3494 case 4: 3495 Op = NVPTXISD::StoreRetvalV4; 3496 break; 3497 default: 3498 llvm_unreachable("Invalid vector info."); 3499 } 3500 3501 // Adjust type of load/store op if we've extended the scalar 3502 // return value. 3503 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 3504 Chain = DAG.getMemIntrinsicNode( 3505 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, 3506 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 3507 // Cleanup vector state. 
3508 StoreOperands.clear(); 3509 } 3510 } 3511 3512 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain); 3513 } 3514 3515 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 3516 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, 3517 SelectionDAG &DAG) const { 3518 if (Constraint.size() > 1) 3519 return; 3520 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 3521 } 3522 3523 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 3524 switch (Intrinsic) { 3525 default: 3526 return 0; 3527 3528 case Intrinsic::nvvm_tex_1d_v4f32_s32: 3529 return NVPTXISD::Tex1DFloatS32; 3530 case Intrinsic::nvvm_tex_1d_v4f32_f32: 3531 return NVPTXISD::Tex1DFloatFloat; 3532 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 3533 return NVPTXISD::Tex1DFloatFloatLevel; 3534 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3535 return NVPTXISD::Tex1DFloatFloatGrad; 3536 case Intrinsic::nvvm_tex_1d_v4s32_s32: 3537 return NVPTXISD::Tex1DS32S32; 3538 case Intrinsic::nvvm_tex_1d_v4s32_f32: 3539 return NVPTXISD::Tex1DS32Float; 3540 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 3541 return NVPTXISD::Tex1DS32FloatLevel; 3542 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 3543 return NVPTXISD::Tex1DS32FloatGrad; 3544 case Intrinsic::nvvm_tex_1d_v4u32_s32: 3545 return NVPTXISD::Tex1DU32S32; 3546 case Intrinsic::nvvm_tex_1d_v4u32_f32: 3547 return NVPTXISD::Tex1DU32Float; 3548 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 3549 return NVPTXISD::Tex1DU32FloatLevel; 3550 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 3551 return NVPTXISD::Tex1DU32FloatGrad; 3552 3553 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3554 return NVPTXISD::Tex1DArrayFloatS32; 3555 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3556 return NVPTXISD::Tex1DArrayFloatFloat; 3557 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3558 return NVPTXISD::Tex1DArrayFloatFloatLevel; 3559 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3560 return NVPTXISD::Tex1DArrayFloatFloatGrad; 3561 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 3562 return NVPTXISD::Tex1DArrayS32S32; 3563 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 3564 return NVPTXISD::Tex1DArrayS32Float; 3565 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 3566 return NVPTXISD::Tex1DArrayS32FloatLevel; 3567 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 3568 return NVPTXISD::Tex1DArrayS32FloatGrad; 3569 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 3570 return NVPTXISD::Tex1DArrayU32S32; 3571 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 3572 return NVPTXISD::Tex1DArrayU32Float; 3573 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 3574 return NVPTXISD::Tex1DArrayU32FloatLevel; 3575 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 3576 return NVPTXISD::Tex1DArrayU32FloatGrad; 3577 3578 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3579 return NVPTXISD::Tex2DFloatS32; 3580 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3581 return NVPTXISD::Tex2DFloatFloat; 3582 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3583 return NVPTXISD::Tex2DFloatFloatLevel; 3584 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3585 return NVPTXISD::Tex2DFloatFloatGrad; 3586 case Intrinsic::nvvm_tex_2d_v4s32_s32: 3587 return NVPTXISD::Tex2DS32S32; 3588 case Intrinsic::nvvm_tex_2d_v4s32_f32: 3589 return NVPTXISD::Tex2DS32Float; 3590 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 3591 return NVPTXISD::Tex2DS32FloatLevel; 3592 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 3593 return NVPTXISD::Tex2DS32FloatGrad; 3594 case Intrinsic::nvvm_tex_2d_v4u32_s32: 3595 return NVPTXISD::Tex2DU32S32; 3596 case 
Intrinsic::nvvm_tex_2d_v4u32_f32: 3597 return NVPTXISD::Tex2DU32Float; 3598 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 3599 return NVPTXISD::Tex2DU32FloatLevel; 3600 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 3601 return NVPTXISD::Tex2DU32FloatGrad; 3602 3603 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3604 return NVPTXISD::Tex2DArrayFloatS32; 3605 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3606 return NVPTXISD::Tex2DArrayFloatFloat; 3607 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3608 return NVPTXISD::Tex2DArrayFloatFloatLevel; 3609 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3610 return NVPTXISD::Tex2DArrayFloatFloatGrad; 3611 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3612 return NVPTXISD::Tex2DArrayS32S32; 3613 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3614 return NVPTXISD::Tex2DArrayS32Float; 3615 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3616 return NVPTXISD::Tex2DArrayS32FloatLevel; 3617 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3618 return NVPTXISD::Tex2DArrayS32FloatGrad; 3619 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 3620 return NVPTXISD::Tex2DArrayU32S32; 3621 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 3622 return NVPTXISD::Tex2DArrayU32Float; 3623 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 3624 return NVPTXISD::Tex2DArrayU32FloatLevel; 3625 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 3626 return NVPTXISD::Tex2DArrayU32FloatGrad; 3627 3628 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3629 return NVPTXISD::Tex3DFloatS32; 3630 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3631 return NVPTXISD::Tex3DFloatFloat; 3632 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3633 return NVPTXISD::Tex3DFloatFloatLevel; 3634 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3635 return NVPTXISD::Tex3DFloatFloatGrad; 3636 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3637 return NVPTXISD::Tex3DS32S32; 3638 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3639 return NVPTXISD::Tex3DS32Float; 3640 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3641 return NVPTXISD::Tex3DS32FloatLevel; 3642 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3643 return NVPTXISD::Tex3DS32FloatGrad; 3644 case Intrinsic::nvvm_tex_3d_v4u32_s32: 3645 return NVPTXISD::Tex3DU32S32; 3646 case Intrinsic::nvvm_tex_3d_v4u32_f32: 3647 return NVPTXISD::Tex3DU32Float; 3648 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 3649 return NVPTXISD::Tex3DU32FloatLevel; 3650 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 3651 return NVPTXISD::Tex3DU32FloatGrad; 3652 3653 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3654 return NVPTXISD::TexCubeFloatFloat; 3655 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3656 return NVPTXISD::TexCubeFloatFloatLevel; 3657 case Intrinsic::nvvm_tex_cube_v4s32_f32: 3658 return NVPTXISD::TexCubeS32Float; 3659 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3660 return NVPTXISD::TexCubeS32FloatLevel; 3661 case Intrinsic::nvvm_tex_cube_v4u32_f32: 3662 return NVPTXISD::TexCubeU32Float; 3663 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 3664 return NVPTXISD::TexCubeU32FloatLevel; 3665 3666 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3667 return NVPTXISD::TexCubeArrayFloatFloat; 3668 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3669 return NVPTXISD::TexCubeArrayFloatFloatLevel; 3670 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3671 return NVPTXISD::TexCubeArrayS32Float; 3672 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 3673 return NVPTXISD::TexCubeArrayS32FloatLevel; 3674 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 3675 return NVPTXISD::TexCubeArrayU32Float; 3676 case 
Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 3677 return NVPTXISD::TexCubeArrayU32FloatLevel; 3678 3679 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3680 return NVPTXISD::Tld4R2DFloatFloat; 3681 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3682 return NVPTXISD::Tld4G2DFloatFloat; 3683 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3684 return NVPTXISD::Tld4B2DFloatFloat; 3685 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3686 return NVPTXISD::Tld4A2DFloatFloat; 3687 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 3688 return NVPTXISD::Tld4R2DS64Float; 3689 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 3690 return NVPTXISD::Tld4G2DS64Float; 3691 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 3692 return NVPTXISD::Tld4B2DS64Float; 3693 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 3694 return NVPTXISD::Tld4A2DS64Float; 3695 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 3696 return NVPTXISD::Tld4R2DU64Float; 3697 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 3698 return NVPTXISD::Tld4G2DU64Float; 3699 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 3700 return NVPTXISD::Tld4B2DU64Float; 3701 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 3702 return NVPTXISD::Tld4A2DU64Float; 3703 3704 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3705 return NVPTXISD::TexUnified1DFloatS32; 3706 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3707 return NVPTXISD::TexUnified1DFloatFloat; 3708 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3709 return NVPTXISD::TexUnified1DFloatFloatLevel; 3710 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3711 return NVPTXISD::TexUnified1DFloatFloatGrad; 3712 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 3713 return NVPTXISD::TexUnified1DS32S32; 3714 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 3715 return NVPTXISD::TexUnified1DS32Float; 3716 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 3717 return NVPTXISD::TexUnified1DS32FloatLevel; 3718 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 3719 return NVPTXISD::TexUnified1DS32FloatGrad; 3720 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 3721 return NVPTXISD::TexUnified1DU32S32; 3722 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 3723 return NVPTXISD::TexUnified1DU32Float; 3724 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 3725 return NVPTXISD::TexUnified1DU32FloatLevel; 3726 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 3727 return NVPTXISD::TexUnified1DU32FloatGrad; 3728 3729 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3730 return NVPTXISD::TexUnified1DArrayFloatS32; 3731 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3732 return NVPTXISD::TexUnified1DArrayFloatFloat; 3733 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3734 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 3735 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3736 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 3737 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 3738 return NVPTXISD::TexUnified1DArrayS32S32; 3739 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3740 return NVPTXISD::TexUnified1DArrayS32Float; 3741 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3742 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 3743 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3744 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 3745 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3746 return NVPTXISD::TexUnified1DArrayU32S32; 3747 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3748 return NVPTXISD::TexUnified1DArrayU32Float; 3749 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3750 return 
NVPTXISD::TexUnified1DArrayU32FloatLevel; 3751 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3752 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 3753 3754 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3755 return NVPTXISD::TexUnified2DFloatS32; 3756 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3757 return NVPTXISD::TexUnified2DFloatFloat; 3758 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3759 return NVPTXISD::TexUnified2DFloatFloatLevel; 3760 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3761 return NVPTXISD::TexUnified2DFloatFloatGrad; 3762 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3763 return NVPTXISD::TexUnified2DS32S32; 3764 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3765 return NVPTXISD::TexUnified2DS32Float; 3766 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3767 return NVPTXISD::TexUnified2DS32FloatLevel; 3768 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3769 return NVPTXISD::TexUnified2DS32FloatGrad; 3770 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3771 return NVPTXISD::TexUnified2DU32S32; 3772 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3773 return NVPTXISD::TexUnified2DU32Float; 3774 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3775 return NVPTXISD::TexUnified2DU32FloatLevel; 3776 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3777 return NVPTXISD::TexUnified2DU32FloatGrad; 3778 3779 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3780 return NVPTXISD::TexUnified2DArrayFloatS32; 3781 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3782 return NVPTXISD::TexUnified2DArrayFloatFloat; 3783 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3784 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3785 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3786 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3787 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3788 return NVPTXISD::TexUnified2DArrayS32S32; 3789 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3790 return NVPTXISD::TexUnified2DArrayS32Float; 3791 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3792 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3793 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3794 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3795 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3796 return NVPTXISD::TexUnified2DArrayU32S32; 3797 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3798 return NVPTXISD::TexUnified2DArrayU32Float; 3799 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3800 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3801 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3802 return NVPTXISD::TexUnified2DArrayU32FloatGrad; 3803 3804 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3805 return NVPTXISD::TexUnified3DFloatS32; 3806 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3807 return NVPTXISD::TexUnified3DFloatFloat; 3808 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3809 return NVPTXISD::TexUnified3DFloatFloatLevel; 3810 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3811 return NVPTXISD::TexUnified3DFloatFloatGrad; 3812 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3813 return NVPTXISD::TexUnified3DS32S32; 3814 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3815 return NVPTXISD::TexUnified3DS32Float; 3816 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3817 return NVPTXISD::TexUnified3DS32FloatLevel; 3818 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3819 return NVPTXISD::TexUnified3DS32FloatGrad; 3820 case 
Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3821 return NVPTXISD::TexUnified3DU32S32; 3822 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3823 return NVPTXISD::TexUnified3DU32Float; 3824 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3825 return NVPTXISD::TexUnified3DU32FloatLevel; 3826 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3827 return NVPTXISD::TexUnified3DU32FloatGrad; 3828 3829 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3830 return NVPTXISD::TexUnifiedCubeFloatFloat; 3831 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3832 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3833 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3834 return NVPTXISD::TexUnifiedCubeS32Float; 3835 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3836 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3837 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3838 return NVPTXISD::TexUnifiedCubeU32Float; 3839 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3840 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3841 3842 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3843 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3844 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3845 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3846 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3847 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3848 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3849 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3850 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3851 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3852 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3853 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3854 3855 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: 3856 return NVPTXISD::TexUnifiedCubeFloatFloatGrad; 3857 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: 3858 return NVPTXISD::TexUnifiedCubeS32FloatGrad; 3859 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: 3860 return NVPTXISD::TexUnifiedCubeU32FloatGrad; 3861 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: 3862 return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad; 3863 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: 3864 return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad; 3865 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: 3866 return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad; 3867 3868 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3869 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3870 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3871 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3872 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3873 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3874 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3875 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3876 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3877 return NVPTXISD::Tld4UnifiedR2DS64Float; 3878 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3879 return NVPTXISD::Tld4UnifiedG2DS64Float; 3880 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3881 return NVPTXISD::Tld4UnifiedB2DS64Float; 3882 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3883 return NVPTXISD::Tld4UnifiedA2DS64Float; 3884 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3885 return NVPTXISD::Tld4UnifiedR2DU64Float; 3886 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3887 return NVPTXISD::Tld4UnifiedG2DU64Float; 3888 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3889 return 
NVPTXISD::Tld4UnifiedB2DU64Float; 3890 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3891 return NVPTXISD::Tld4UnifiedA2DU64Float; 3892 } 3893 } 3894 3895 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3896 switch (Intrinsic) { 3897 default: 3898 return 0; 3899 case Intrinsic::nvvm_suld_1d_i8_clamp: 3900 return NVPTXISD::Suld1DI8Clamp; 3901 case Intrinsic::nvvm_suld_1d_i16_clamp: 3902 return NVPTXISD::Suld1DI16Clamp; 3903 case Intrinsic::nvvm_suld_1d_i32_clamp: 3904 return NVPTXISD::Suld1DI32Clamp; 3905 case Intrinsic::nvvm_suld_1d_i64_clamp: 3906 return NVPTXISD::Suld1DI64Clamp; 3907 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3908 return NVPTXISD::Suld1DV2I8Clamp; 3909 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3910 return NVPTXISD::Suld1DV2I16Clamp; 3911 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3912 return NVPTXISD::Suld1DV2I32Clamp; 3913 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3914 return NVPTXISD::Suld1DV2I64Clamp; 3915 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3916 return NVPTXISD::Suld1DV4I8Clamp; 3917 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3918 return NVPTXISD::Suld1DV4I16Clamp; 3919 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3920 return NVPTXISD::Suld1DV4I32Clamp; 3921 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3922 return NVPTXISD::Suld1DArrayI8Clamp; 3923 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3924 return NVPTXISD::Suld1DArrayI16Clamp; 3925 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3926 return NVPTXISD::Suld1DArrayI32Clamp; 3927 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3928 return NVPTXISD::Suld1DArrayI64Clamp; 3929 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3930 return NVPTXISD::Suld1DArrayV2I8Clamp; 3931 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3932 return NVPTXISD::Suld1DArrayV2I16Clamp; 3933 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3934 return NVPTXISD::Suld1DArrayV2I32Clamp; 3935 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3936 return NVPTXISD::Suld1DArrayV2I64Clamp; 3937 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3938 return NVPTXISD::Suld1DArrayV4I8Clamp; 3939 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3940 return NVPTXISD::Suld1DArrayV4I16Clamp; 3941 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3942 return NVPTXISD::Suld1DArrayV4I32Clamp; 3943 case Intrinsic::nvvm_suld_2d_i8_clamp: 3944 return NVPTXISD::Suld2DI8Clamp; 3945 case Intrinsic::nvvm_suld_2d_i16_clamp: 3946 return NVPTXISD::Suld2DI16Clamp; 3947 case Intrinsic::nvvm_suld_2d_i32_clamp: 3948 return NVPTXISD::Suld2DI32Clamp; 3949 case Intrinsic::nvvm_suld_2d_i64_clamp: 3950 return NVPTXISD::Suld2DI64Clamp; 3951 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3952 return NVPTXISD::Suld2DV2I8Clamp; 3953 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3954 return NVPTXISD::Suld2DV2I16Clamp; 3955 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3956 return NVPTXISD::Suld2DV2I32Clamp; 3957 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3958 return NVPTXISD::Suld2DV2I64Clamp; 3959 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3960 return NVPTXISD::Suld2DV4I8Clamp; 3961 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3962 return NVPTXISD::Suld2DV4I16Clamp; 3963 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3964 return NVPTXISD::Suld2DV4I32Clamp; 3965 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3966 return NVPTXISD::Suld2DArrayI8Clamp; 3967 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3968 return NVPTXISD::Suld2DArrayI16Clamp; 3969 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 3970 return NVPTXISD::Suld2DArrayI32Clamp; 3971 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3972 return 
NVPTXISD::Suld2DArrayI64Clamp; 3973 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3974 return NVPTXISD::Suld2DArrayV2I8Clamp; 3975 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3976 return NVPTXISD::Suld2DArrayV2I16Clamp; 3977 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3978 return NVPTXISD::Suld2DArrayV2I32Clamp; 3979 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3980 return NVPTXISD::Suld2DArrayV2I64Clamp; 3981 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3982 return NVPTXISD::Suld2DArrayV4I8Clamp; 3983 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3984 return NVPTXISD::Suld2DArrayV4I16Clamp; 3985 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3986 return NVPTXISD::Suld2DArrayV4I32Clamp; 3987 case Intrinsic::nvvm_suld_3d_i8_clamp: 3988 return NVPTXISD::Suld3DI8Clamp; 3989 case Intrinsic::nvvm_suld_3d_i16_clamp: 3990 return NVPTXISD::Suld3DI16Clamp; 3991 case Intrinsic::nvvm_suld_3d_i32_clamp: 3992 return NVPTXISD::Suld3DI32Clamp; 3993 case Intrinsic::nvvm_suld_3d_i64_clamp: 3994 return NVPTXISD::Suld3DI64Clamp; 3995 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3996 return NVPTXISD::Suld3DV2I8Clamp; 3997 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3998 return NVPTXISD::Suld3DV2I16Clamp; 3999 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 4000 return NVPTXISD::Suld3DV2I32Clamp; 4001 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 4002 return NVPTXISD::Suld3DV2I64Clamp; 4003 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4004 return NVPTXISD::Suld3DV4I8Clamp; 4005 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4006 return NVPTXISD::Suld3DV4I16Clamp; 4007 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 4008 return NVPTXISD::Suld3DV4I32Clamp; 4009 case Intrinsic::nvvm_suld_1d_i8_trap: 4010 return NVPTXISD::Suld1DI8Trap; 4011 case Intrinsic::nvvm_suld_1d_i16_trap: 4012 return NVPTXISD::Suld1DI16Trap; 4013 case Intrinsic::nvvm_suld_1d_i32_trap: 4014 return NVPTXISD::Suld1DI32Trap; 4015 case Intrinsic::nvvm_suld_1d_i64_trap: 4016 return NVPTXISD::Suld1DI64Trap; 4017 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4018 return NVPTXISD::Suld1DV2I8Trap; 4019 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4020 return NVPTXISD::Suld1DV2I16Trap; 4021 case Intrinsic::nvvm_suld_1d_v2i32_trap: 4022 return NVPTXISD::Suld1DV2I32Trap; 4023 case Intrinsic::nvvm_suld_1d_v2i64_trap: 4024 return NVPTXISD::Suld1DV2I64Trap; 4025 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4026 return NVPTXISD::Suld1DV4I8Trap; 4027 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4028 return NVPTXISD::Suld1DV4I16Trap; 4029 case Intrinsic::nvvm_suld_1d_v4i32_trap: 4030 return NVPTXISD::Suld1DV4I32Trap; 4031 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4032 return NVPTXISD::Suld1DArrayI8Trap; 4033 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4034 return NVPTXISD::Suld1DArrayI16Trap; 4035 case Intrinsic::nvvm_suld_1d_array_i32_trap: 4036 return NVPTXISD::Suld1DArrayI32Trap; 4037 case Intrinsic::nvvm_suld_1d_array_i64_trap: 4038 return NVPTXISD::Suld1DArrayI64Trap; 4039 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4040 return NVPTXISD::Suld1DArrayV2I8Trap; 4041 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4042 return NVPTXISD::Suld1DArrayV2I16Trap; 4043 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 4044 return NVPTXISD::Suld1DArrayV2I32Trap; 4045 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 4046 return NVPTXISD::Suld1DArrayV2I64Trap; 4047 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4048 return NVPTXISD::Suld1DArrayV4I8Trap; 4049 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4050 return NVPTXISD::Suld1DArrayV4I16Trap; 4051 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 
4052 return NVPTXISD::Suld1DArrayV4I32Trap; 4053 case Intrinsic::nvvm_suld_2d_i8_trap: 4054 return NVPTXISD::Suld2DI8Trap; 4055 case Intrinsic::nvvm_suld_2d_i16_trap: 4056 return NVPTXISD::Suld2DI16Trap; 4057 case Intrinsic::nvvm_suld_2d_i32_trap: 4058 return NVPTXISD::Suld2DI32Trap; 4059 case Intrinsic::nvvm_suld_2d_i64_trap: 4060 return NVPTXISD::Suld2DI64Trap; 4061 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4062 return NVPTXISD::Suld2DV2I8Trap; 4063 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4064 return NVPTXISD::Suld2DV2I16Trap; 4065 case Intrinsic::nvvm_suld_2d_v2i32_trap: 4066 return NVPTXISD::Suld2DV2I32Trap; 4067 case Intrinsic::nvvm_suld_2d_v2i64_trap: 4068 return NVPTXISD::Suld2DV2I64Trap; 4069 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4070 return NVPTXISD::Suld2DV4I8Trap; 4071 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4072 return NVPTXISD::Suld2DV4I16Trap; 4073 case Intrinsic::nvvm_suld_2d_v4i32_trap: 4074 return NVPTXISD::Suld2DV4I32Trap; 4075 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4076 return NVPTXISD::Suld2DArrayI8Trap; 4077 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4078 return NVPTXISD::Suld2DArrayI16Trap; 4079 case Intrinsic::nvvm_suld_2d_array_i32_trap: 4080 return NVPTXISD::Suld2DArrayI32Trap; 4081 case Intrinsic::nvvm_suld_2d_array_i64_trap: 4082 return NVPTXISD::Suld2DArrayI64Trap; 4083 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4084 return NVPTXISD::Suld2DArrayV2I8Trap; 4085 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4086 return NVPTXISD::Suld2DArrayV2I16Trap; 4087 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 4088 return NVPTXISD::Suld2DArrayV2I32Trap; 4089 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 4090 return NVPTXISD::Suld2DArrayV2I64Trap; 4091 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4092 return NVPTXISD::Suld2DArrayV4I8Trap; 4093 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4094 return NVPTXISD::Suld2DArrayV4I16Trap; 4095 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 4096 return NVPTXISD::Suld2DArrayV4I32Trap; 4097 case Intrinsic::nvvm_suld_3d_i8_trap: 4098 return NVPTXISD::Suld3DI8Trap; 4099 case Intrinsic::nvvm_suld_3d_i16_trap: 4100 return NVPTXISD::Suld3DI16Trap; 4101 case Intrinsic::nvvm_suld_3d_i32_trap: 4102 return NVPTXISD::Suld3DI32Trap; 4103 case Intrinsic::nvvm_suld_3d_i64_trap: 4104 return NVPTXISD::Suld3DI64Trap; 4105 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4106 return NVPTXISD::Suld3DV2I8Trap; 4107 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4108 return NVPTXISD::Suld3DV2I16Trap; 4109 case Intrinsic::nvvm_suld_3d_v2i32_trap: 4110 return NVPTXISD::Suld3DV2I32Trap; 4111 case Intrinsic::nvvm_suld_3d_v2i64_trap: 4112 return NVPTXISD::Suld3DV2I64Trap; 4113 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4114 return NVPTXISD::Suld3DV4I8Trap; 4115 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4116 return NVPTXISD::Suld3DV4I16Trap; 4117 case Intrinsic::nvvm_suld_3d_v4i32_trap: 4118 return NVPTXISD::Suld3DV4I32Trap; 4119 case Intrinsic::nvvm_suld_1d_i8_zero: 4120 return NVPTXISD::Suld1DI8Zero; 4121 case Intrinsic::nvvm_suld_1d_i16_zero: 4122 return NVPTXISD::Suld1DI16Zero; 4123 case Intrinsic::nvvm_suld_1d_i32_zero: 4124 return NVPTXISD::Suld1DI32Zero; 4125 case Intrinsic::nvvm_suld_1d_i64_zero: 4126 return NVPTXISD::Suld1DI64Zero; 4127 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4128 return NVPTXISD::Suld1DV2I8Zero; 4129 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4130 return NVPTXISD::Suld1DV2I16Zero; 4131 case Intrinsic::nvvm_suld_1d_v2i32_zero: 4132 return NVPTXISD::Suld1DV2I32Zero; 4133 case Intrinsic::nvvm_suld_1d_v2i64_zero: 4134 return 
NVPTXISD::Suld1DV2I64Zero; 4135 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4136 return NVPTXISD::Suld1DV4I8Zero; 4137 case Intrinsic::nvvm_suld_1d_v4i16_zero: 4138 return NVPTXISD::Suld1DV4I16Zero; 4139 case Intrinsic::nvvm_suld_1d_v4i32_zero: 4140 return NVPTXISD::Suld1DV4I32Zero; 4141 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4142 return NVPTXISD::Suld1DArrayI8Zero; 4143 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4144 return NVPTXISD::Suld1DArrayI16Zero; 4145 case Intrinsic::nvvm_suld_1d_array_i32_zero: 4146 return NVPTXISD::Suld1DArrayI32Zero; 4147 case Intrinsic::nvvm_suld_1d_array_i64_zero: 4148 return NVPTXISD::Suld1DArrayI64Zero; 4149 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4150 return NVPTXISD::Suld1DArrayV2I8Zero; 4151 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4152 return NVPTXISD::Suld1DArrayV2I16Zero; 4153 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 4154 return NVPTXISD::Suld1DArrayV2I32Zero; 4155 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 4156 return NVPTXISD::Suld1DArrayV2I64Zero; 4157 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4158 return NVPTXISD::Suld1DArrayV4I8Zero; 4159 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4160 return NVPTXISD::Suld1DArrayV4I16Zero; 4161 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 4162 return NVPTXISD::Suld1DArrayV4I32Zero; 4163 case Intrinsic::nvvm_suld_2d_i8_zero: 4164 return NVPTXISD::Suld2DI8Zero; 4165 case Intrinsic::nvvm_suld_2d_i16_zero: 4166 return NVPTXISD::Suld2DI16Zero; 4167 case Intrinsic::nvvm_suld_2d_i32_zero: 4168 return NVPTXISD::Suld2DI32Zero; 4169 case Intrinsic::nvvm_suld_2d_i64_zero: 4170 return NVPTXISD::Suld2DI64Zero; 4171 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4172 return NVPTXISD::Suld2DV2I8Zero; 4173 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4174 return NVPTXISD::Suld2DV2I16Zero; 4175 case Intrinsic::nvvm_suld_2d_v2i32_zero: 4176 return NVPTXISD::Suld2DV2I32Zero; 4177 case Intrinsic::nvvm_suld_2d_v2i64_zero: 4178 return NVPTXISD::Suld2DV2I64Zero; 4179 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4180 return NVPTXISD::Suld2DV4I8Zero; 4181 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4182 return NVPTXISD::Suld2DV4I16Zero; 4183 case Intrinsic::nvvm_suld_2d_v4i32_zero: 4184 return NVPTXISD::Suld2DV4I32Zero; 4185 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4186 return NVPTXISD::Suld2DArrayI8Zero; 4187 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4188 return NVPTXISD::Suld2DArrayI16Zero; 4189 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4190 return NVPTXISD::Suld2DArrayI32Zero; 4191 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4192 return NVPTXISD::Suld2DArrayI64Zero; 4193 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4194 return NVPTXISD::Suld2DArrayV2I8Zero; 4195 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4196 return NVPTXISD::Suld2DArrayV2I16Zero; 4197 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4198 return NVPTXISD::Suld2DArrayV2I32Zero; 4199 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4200 return NVPTXISD::Suld2DArrayV2I64Zero; 4201 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4202 return NVPTXISD::Suld2DArrayV4I8Zero; 4203 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4204 return NVPTXISD::Suld2DArrayV4I16Zero; 4205 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4206 return NVPTXISD::Suld2DArrayV4I32Zero; 4207 case Intrinsic::nvvm_suld_3d_i8_zero: 4208 return NVPTXISD::Suld3DI8Zero; 4209 case Intrinsic::nvvm_suld_3d_i16_zero: 4210 return NVPTXISD::Suld3DI16Zero; 4211 case Intrinsic::nvvm_suld_3d_i32_zero: 4212 return NVPTXISD::Suld3DI32Zero; 4213 case Intrinsic::nvvm_suld_3d_i64_zero: 4214 
return NVPTXISD::Suld3DI64Zero; 4215 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4216 return NVPTXISD::Suld3DV2I8Zero; 4217 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4218 return NVPTXISD::Suld3DV2I16Zero; 4219 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4220 return NVPTXISD::Suld3DV2I32Zero; 4221 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4222 return NVPTXISD::Suld3DV2I64Zero; 4223 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4224 return NVPTXISD::Suld3DV4I8Zero; 4225 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4226 return NVPTXISD::Suld3DV4I16Zero; 4227 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4228 return NVPTXISD::Suld3DV4I32Zero; 4229 } 4230 } 4231 4232 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 4233 // TgtMemIntrinsic 4234 // because we need the information that is only available in the "Value" type 4235 // of destination 4236 // pointer. In particular, the address space information. 4237 bool NVPTXTargetLowering::getTgtMemIntrinsic( 4238 IntrinsicInfo &Info, const CallInst &I, 4239 MachineFunction &MF, unsigned Intrinsic) const { 4240 switch (Intrinsic) { 4241 default: 4242 return false; 4243 case Intrinsic::nvvm_match_all_sync_i32p: 4244 case Intrinsic::nvvm_match_all_sync_i64p: 4245 Info.opc = ISD::INTRINSIC_W_CHAIN; 4246 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 4247 // in order to model data exchange with other threads, but perform no real 4248 // memory accesses. 4249 Info.memVT = MVT::i1; 4250 4251 // Our result depends on both our and other thread's arguments. 4252 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4253 return true; 4254 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: 4255 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: 4256 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: 4257 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: 4258 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: 4259 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: 4260 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: 4261 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: 4262 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: 4263 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: 4264 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: 4265 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: 4266 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: 4267 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: 4268 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: 4269 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: 4270 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: 4271 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: 4272 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: 4273 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: 4274 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: 4275 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: 4276 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: 4277 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { 4278 Info.opc = ISD::INTRINSIC_W_CHAIN; 4279 Info.memVT = MVT::v8f16; 4280 Info.ptrVal = I.getArgOperand(0); 4281 Info.offset = 0; 4282 Info.flags = MachineMemOperand::MOLoad; 4283 Info.align = Align(16); 4284 return true; 4285 } 4286 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: 4287 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: 4288 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: 4289 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: 4290 case 
Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: 4291 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: 4292 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: 4293 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: 4294 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: 4295 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: 4296 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: 4297 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: 4298 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: 4299 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: 4300 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: 4301 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 4302 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 4303 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 4304 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 4305 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: 4306 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: 4307 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: 4308 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: 4309 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { 4310 Info.opc = ISD::INTRINSIC_W_CHAIN; 4311 Info.memVT = MVT::v2i32; 4312 Info.ptrVal = I.getArgOperand(0); 4313 Info.offset = 0; 4314 Info.flags = MachineMemOperand::MOLoad; 4315 Info.align = Align(8); 4316 return true; 4317 } 4318 4319 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 4320 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 4321 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 4322 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 4323 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 4324 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 4325 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 4326 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 4327 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: 4328 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: 4329 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: 4330 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: 4331 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: 4332 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: 4333 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: 4334 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: 4335 4336 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 4337 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 4338 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 4339 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 4340 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 4341 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 4342 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 4343 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: 4344 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: 4345 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: 4346 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: 4347 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: 4348 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: 4349 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: 4350 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: 4351 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: 4352 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: 4353 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: { 4354 Info.opc = ISD::INTRINSIC_W_CHAIN; 4355 Info.memVT = MVT::v4i32; 4356 
Info.ptrVal = I.getArgOperand(0); 4357 Info.offset = 0; 4358 Info.flags = MachineMemOperand::MOLoad; 4359 Info.align = Align(16); 4360 return true; 4361 } 4362 4363 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 4364 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 4365 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 4366 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 4367 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 4368 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 4369 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 4370 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 4371 4372 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 4373 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 4374 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 4375 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 4376 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 4377 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 4378 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 4379 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 4380 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 4381 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 4382 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 4383 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 4384 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 4385 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 4386 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 4387 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 4388 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 4389 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 4390 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 4391 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: 4392 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: 4393 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: { 4394 Info.opc = ISD::INTRINSIC_W_CHAIN; 4395 Info.memVT = MVT::i32; 4396 Info.ptrVal = I.getArgOperand(0); 4397 Info.offset = 0; 4398 Info.flags = MachineMemOperand::MOLoad; 4399 Info.align = Align(4); 4400 return true; 4401 } 4402 4403 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 4404 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 4405 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 4406 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 4407 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 4408 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 4409 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 4410 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 4411 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 4412 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 4413 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 4414 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 4415 Info.opc = ISD::INTRINSIC_W_CHAIN; 4416 Info.memVT = MVT::v4f16; 4417 Info.ptrVal = I.getArgOperand(0); 4418 Info.offset = 0; 4419 Info.flags = MachineMemOperand::MOLoad; 4420 Info.align = Align(16); 4421 return true; 4422 } 4423 4424 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 4425 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 4426 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 4427 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 4428 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 4429 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 4430 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 4431 case 
Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 4432 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 4433 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 4434 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 4435 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: 4436 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: 4437 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: 4438 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: 4439 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { 4440 Info.opc = ISD::INTRINSIC_W_CHAIN; 4441 Info.memVT = MVT::v8f32; 4442 Info.ptrVal = I.getArgOperand(0); 4443 Info.offset = 0; 4444 Info.flags = MachineMemOperand::MOLoad; 4445 Info.align = Align(16); 4446 return true; 4447 } 4448 4449 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: 4450 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: 4451 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: 4452 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: 4453 4454 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: 4455 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: 4456 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: 4457 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: 4458 4459 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 4460 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 4461 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 4462 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 4463 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 4464 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 4465 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 4466 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 4467 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 4468 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 4469 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 4470 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 4471 Info.opc = ISD::INTRINSIC_W_CHAIN; 4472 Info.memVT = MVT::v8i32; 4473 Info.ptrVal = I.getArgOperand(0); 4474 Info.offset = 0; 4475 Info.flags = MachineMemOperand::MOLoad; 4476 Info.align = Align(16); 4477 return true; 4478 } 4479 4480 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 4481 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 4482 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 4483 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 4484 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 4485 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 4486 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 4487 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: 4488 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: 4489 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: { 4490 Info.opc = ISD::INTRINSIC_W_CHAIN; 4491 Info.memVT = MVT::v2i32; 4492 Info.ptrVal = I.getArgOperand(0); 4493 Info.offset = 0; 4494 Info.flags = MachineMemOperand::MOLoad; 4495 Info.align = Align(8); 4496 return true; 4497 } 4498 4499 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: 4500 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: 4501 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: 4502 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: 4503 4504 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: 4505 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: 4506 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: 4507 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { 4508 Info.opc = 
ISD::INTRINSIC_W_CHAIN; 4509 Info.memVT = MVT::f64; 4510 Info.ptrVal = I.getArgOperand(0); 4511 Info.offset = 0; 4512 Info.flags = MachineMemOperand::MOLoad; 4513 Info.align = Align(8); 4514 return true; 4515 } 4516 4517 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: 4518 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: 4519 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: 4520 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { 4521 Info.opc = ISD::INTRINSIC_W_CHAIN; 4522 Info.memVT = MVT::v2f64; 4523 Info.ptrVal = I.getArgOperand(0); 4524 Info.offset = 0; 4525 Info.flags = MachineMemOperand::MOLoad; 4526 Info.align = Align(16); 4527 return true; 4528 } 4529 4530 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 4531 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 4532 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 4533 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 4534 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 4535 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 4536 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 4537 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 4538 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 4539 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 4540 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 4541 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 4542 Info.opc = ISD::INTRINSIC_VOID; 4543 Info.memVT = MVT::v4f16; 4544 Info.ptrVal = I.getArgOperand(0); 4545 Info.offset = 0; 4546 Info.flags = MachineMemOperand::MOStore; 4547 Info.align = Align(16); 4548 return true; 4549 } 4550 4551 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 4552 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 4553 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 4554 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 4555 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 4556 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 4557 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 4558 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 4559 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 4560 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 4561 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 4562 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: 4563 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: 4564 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: 4565 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: 4566 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { 4567 Info.opc = ISD::INTRINSIC_VOID; 4568 Info.memVT = MVT::v8f32; 4569 Info.ptrVal = I.getArgOperand(0); 4570 Info.offset = 0; 4571 Info.flags = MachineMemOperand::MOStore; 4572 Info.align = Align(16); 4573 return true; 4574 } 4575 4576 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 4577 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 4578 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 4579 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 4580 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 4581 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 4582 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 4583 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 4584 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 4585 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 4586 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 4587 case 
Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 4588 Info.opc = ISD::INTRINSIC_VOID; 4589 Info.memVT = MVT::v8i32; 4590 Info.ptrVal = I.getArgOperand(0); 4591 Info.offset = 0; 4592 Info.flags = MachineMemOperand::MOStore; 4593 Info.align = Align(16); 4594 return true; 4595 } 4596 4597 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 4598 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 4599 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 4600 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 4601 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 4602 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 4603 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 4604 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 4605 Info.opc = ISD::INTRINSIC_VOID; 4606 Info.memVT = MVT::v2i32; 4607 Info.ptrVal = I.getArgOperand(0); 4608 Info.offset = 0; 4609 Info.flags = MachineMemOperand::MOStore; 4610 Info.align = Align(8); 4611 return true; 4612 } 4613 4614 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: 4615 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: 4616 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: 4617 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { 4618 Info.opc = ISD::INTRINSIC_VOID; 4619 Info.memVT = MVT::v2f64; 4620 Info.ptrVal = I.getArgOperand(0); 4621 Info.offset = 0; 4622 Info.flags = MachineMemOperand::MOStore; 4623 Info.align = Align(16); 4624 return true; 4625 } 4626 4627 case Intrinsic::nvvm_atomic_load_inc_32: 4628 case Intrinsic::nvvm_atomic_load_dec_32: 4629 4630 case Intrinsic::nvvm_atomic_add_gen_f_cta: 4631 case Intrinsic::nvvm_atomic_add_gen_f_sys: 4632 case Intrinsic::nvvm_atomic_add_gen_i_cta: 4633 case Intrinsic::nvvm_atomic_add_gen_i_sys: 4634 case Intrinsic::nvvm_atomic_and_gen_i_cta: 4635 case Intrinsic::nvvm_atomic_and_gen_i_sys: 4636 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 4637 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 4638 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 4639 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 4640 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 4641 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 4642 case Intrinsic::nvvm_atomic_max_gen_i_cta: 4643 case Intrinsic::nvvm_atomic_max_gen_i_sys: 4644 case Intrinsic::nvvm_atomic_min_gen_i_cta: 4645 case Intrinsic::nvvm_atomic_min_gen_i_sys: 4646 case Intrinsic::nvvm_atomic_or_gen_i_cta: 4647 case Intrinsic::nvvm_atomic_or_gen_i_sys: 4648 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 4649 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 4650 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 4651 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 4652 auto &DL = I.getDataLayout(); 4653 Info.opc = ISD::INTRINSIC_W_CHAIN; 4654 Info.memVT = getValueType(DL, I.getType()); 4655 Info.ptrVal = I.getArgOperand(0); 4656 Info.offset = 0; 4657 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4658 Info.align.reset(); 4659 return true; 4660 } 4661 4662 case Intrinsic::nvvm_ldu_global_i: 4663 case Intrinsic::nvvm_ldu_global_f: 4664 case Intrinsic::nvvm_ldu_global_p: { 4665 auto &DL = I.getDataLayout(); 4666 Info.opc = ISD::INTRINSIC_W_CHAIN; 4667 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 4668 Info.memVT = getValueType(DL, I.getType()); 4669 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 4670 Info.memVT = getPointerTy(DL); 4671 else 4672 Info.memVT = getValueType(DL, I.getType()); 4673 Info.ptrVal = I.getArgOperand(0); 4674 Info.offset = 0; 4675 Info.flags = MachineMemOperand::MOLoad; 4676 Info.align = 
cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4677 4678 return true; 4679 } 4680 case Intrinsic::nvvm_ldg_global_i: 4681 case Intrinsic::nvvm_ldg_global_f: 4682 case Intrinsic::nvvm_ldg_global_p: { 4683 auto &DL = I.getDataLayout(); 4684 4685 Info.opc = ISD::INTRINSIC_W_CHAIN; 4686 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 4687 Info.memVT = getValueType(DL, I.getType()); 4688 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 4689 Info.memVT = getPointerTy(DL); 4690 else 4691 Info.memVT = getValueType(DL, I.getType()); 4692 Info.ptrVal = I.getArgOperand(0); 4693 Info.offset = 0; 4694 Info.flags = MachineMemOperand::MOLoad; 4695 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4696 4697 return true; 4698 } 4699 4700 case Intrinsic::nvvm_tex_1d_v4f32_s32: 4701 case Intrinsic::nvvm_tex_1d_v4f32_f32: 4702 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 4703 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 4704 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 4705 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 4706 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 4707 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 4708 case Intrinsic::nvvm_tex_2d_v4f32_s32: 4709 case Intrinsic::nvvm_tex_2d_v4f32_f32: 4710 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 4711 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 4712 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 4713 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 4714 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 4715 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 4716 case Intrinsic::nvvm_tex_3d_v4f32_s32: 4717 case Intrinsic::nvvm_tex_3d_v4f32_f32: 4718 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 4719 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 4720 case Intrinsic::nvvm_tex_cube_v4f32_f32: 4721 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 4722 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 4723 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 4724 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 4725 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 4726 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 4727 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 4728 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 4729 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 4730 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 4731 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 4732 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 4733 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 4734 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 4735 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 4736 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 4737 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 4738 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 4739 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 4740 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 4741 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 4742 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 4743 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 4744 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 4745 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 4746 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 4747 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 4748 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 4749 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 4750 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 4751 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 4752 case 
Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: 4753 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: 4754 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 4755 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 4756 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 4757 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 4758 Info.opc = getOpcForTextureInstr(Intrinsic); 4759 Info.memVT = MVT::v4f32; 4760 Info.ptrVal = nullptr; 4761 Info.offset = 0; 4762 Info.flags = MachineMemOperand::MOLoad; 4763 Info.align = Align(16); 4764 return true; 4765 4766 case Intrinsic::nvvm_tex_1d_v4s32_s32: 4767 case Intrinsic::nvvm_tex_1d_v4s32_f32: 4768 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 4769 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 4770 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 4771 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 4772 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 4773 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 4774 case Intrinsic::nvvm_tex_2d_v4s32_s32: 4775 case Intrinsic::nvvm_tex_2d_v4s32_f32: 4776 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 4777 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 4778 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 4779 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 4780 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 4781 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 4782 case Intrinsic::nvvm_tex_3d_v4s32_s32: 4783 case Intrinsic::nvvm_tex_3d_v4s32_f32: 4784 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 4785 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 4786 case Intrinsic::nvvm_tex_cube_v4s32_f32: 4787 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 4788 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 4789 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 4790 case Intrinsic::nvvm_tex_cube_v4u32_f32: 4791 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 4792 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 4793 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 4794 case Intrinsic::nvvm_tex_1d_v4u32_s32: 4795 case Intrinsic::nvvm_tex_1d_v4u32_f32: 4796 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 4797 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 4798 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 4799 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 4800 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 4801 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 4802 case Intrinsic::nvvm_tex_2d_v4u32_s32: 4803 case Intrinsic::nvvm_tex_2d_v4u32_f32: 4804 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 4805 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 4806 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 4807 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 4808 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 4809 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 4810 case Intrinsic::nvvm_tex_3d_v4u32_s32: 4811 case Intrinsic::nvvm_tex_3d_v4u32_f32: 4812 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 4813 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 4814 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 4815 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 4816 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 4817 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 4818 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 4819 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 4820 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 4821 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 4822 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 4823 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 4824 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 4825 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 4826 case 
Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 4827 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 4828 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 4829 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 4830 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 4831 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 4832 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 4833 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 4834 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 4835 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 4836 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 4837 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 4838 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 4839 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 4840 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 4841 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 4842 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 4843 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 4844 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 4845 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 4846 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 4847 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 4848 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 4849 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 4850 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 4851 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 4852 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 4853 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 4854 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 4855 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 4856 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 4857 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 4858 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 4859 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 4860 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 4861 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 4862 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 4863 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 4864 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 4865 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 4866 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 4867 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 4868 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 4869 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 4870 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: 4871 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: 4872 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: 4873 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: 4874 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 4875 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 4876 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 4877 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 4878 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 4879 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 4880 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 4881 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 4882 Info.opc = getOpcForTextureInstr(Intrinsic); 4883 Info.memVT = MVT::v4i32; 4884 Info.ptrVal = nullptr; 4885 Info.offset = 0; 4886 Info.flags = MachineMemOperand::MOLoad; 4887 Info.align = Align(16); 4888 return true; 4889 4890 case Intrinsic::nvvm_suld_1d_i8_clamp: 4891 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 4892 case 
Intrinsic::nvvm_suld_1d_v4i8_clamp: 4893 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 4894 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 4895 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 4896 case Intrinsic::nvvm_suld_2d_i8_clamp: 4897 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 4898 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4899 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4900 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4901 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4902 case Intrinsic::nvvm_suld_3d_i8_clamp: 4903 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4904 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4905 case Intrinsic::nvvm_suld_1d_i8_trap: 4906 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4907 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4908 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4909 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4910 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4911 case Intrinsic::nvvm_suld_2d_i8_trap: 4912 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4913 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4914 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4915 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4916 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4917 case Intrinsic::nvvm_suld_3d_i8_trap: 4918 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4919 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4920 case Intrinsic::nvvm_suld_1d_i8_zero: 4921 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4922 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4923 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4924 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4925 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4926 case Intrinsic::nvvm_suld_2d_i8_zero: 4927 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4928 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4929 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4930 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4931 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4932 case Intrinsic::nvvm_suld_3d_i8_zero: 4933 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4934 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4935 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4936 Info.memVT = MVT::i8; 4937 Info.ptrVal = nullptr; 4938 Info.offset = 0; 4939 Info.flags = MachineMemOperand::MOLoad; 4940 Info.align = Align(16); 4941 return true; 4942 4943 case Intrinsic::nvvm_suld_1d_i16_clamp: 4944 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4945 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4946 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4947 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4948 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4949 case Intrinsic::nvvm_suld_2d_i16_clamp: 4950 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4951 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4952 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4953 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4954 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4955 case Intrinsic::nvvm_suld_3d_i16_clamp: 4956 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4957 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4958 case Intrinsic::nvvm_suld_1d_i16_trap: 4959 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4960 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4961 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4962 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4963 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4964 case Intrinsic::nvvm_suld_2d_i16_trap: 4965 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4966 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4967 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4968 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4969 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 
4970 case Intrinsic::nvvm_suld_3d_i16_trap: 4971 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4972 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4973 case Intrinsic::nvvm_suld_1d_i16_zero: 4974 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4975 case Intrinsic::nvvm_suld_1d_v4i16_zero: 4976 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4977 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4978 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4979 case Intrinsic::nvvm_suld_2d_i16_zero: 4980 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4981 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4982 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4983 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4984 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4985 case Intrinsic::nvvm_suld_3d_i16_zero: 4986 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4987 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4988 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4989 Info.memVT = MVT::i16; 4990 Info.ptrVal = nullptr; 4991 Info.offset = 0; 4992 Info.flags = MachineMemOperand::MOLoad; 4993 Info.align = Align(16); 4994 return true; 4995 4996 case Intrinsic::nvvm_suld_1d_i32_clamp: 4997 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4998 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4999 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 5000 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 5001 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 5002 case Intrinsic::nvvm_suld_2d_i32_clamp: 5003 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 5004 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 5005 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 5006 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 5007 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 5008 case Intrinsic::nvvm_suld_3d_i32_clamp: 5009 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 5010 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 5011 case Intrinsic::nvvm_suld_1d_i32_trap: 5012 case Intrinsic::nvvm_suld_1d_v2i32_trap: 5013 case Intrinsic::nvvm_suld_1d_v4i32_trap: 5014 case Intrinsic::nvvm_suld_1d_array_i32_trap: 5015 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 5016 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 5017 case Intrinsic::nvvm_suld_2d_i32_trap: 5018 case Intrinsic::nvvm_suld_2d_v2i32_trap: 5019 case Intrinsic::nvvm_suld_2d_v4i32_trap: 5020 case Intrinsic::nvvm_suld_2d_array_i32_trap: 5021 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 5022 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 5023 case Intrinsic::nvvm_suld_3d_i32_trap: 5024 case Intrinsic::nvvm_suld_3d_v2i32_trap: 5025 case Intrinsic::nvvm_suld_3d_v4i32_trap: 5026 case Intrinsic::nvvm_suld_1d_i32_zero: 5027 case Intrinsic::nvvm_suld_1d_v2i32_zero: 5028 case Intrinsic::nvvm_suld_1d_v4i32_zero: 5029 case Intrinsic::nvvm_suld_1d_array_i32_zero: 5030 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 5031 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 5032 case Intrinsic::nvvm_suld_2d_i32_zero: 5033 case Intrinsic::nvvm_suld_2d_v2i32_zero: 5034 case Intrinsic::nvvm_suld_2d_v4i32_zero: 5035 case Intrinsic::nvvm_suld_2d_array_i32_zero: 5036 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 5037 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 5038 case Intrinsic::nvvm_suld_3d_i32_zero: 5039 case Intrinsic::nvvm_suld_3d_v2i32_zero: 5040 case Intrinsic::nvvm_suld_3d_v4i32_zero: 5041 Info.opc = getOpcForSurfaceInstr(Intrinsic); 5042 Info.memVT = MVT::i32; 5043 Info.ptrVal = nullptr; 5044 Info.offset = 0; 5045 Info.flags = MachineMemOperand::MOLoad; 5046 Info.align = Align(16); 5047 return true; 5048 5049 case Intrinsic::nvvm_suld_1d_i64_clamp: 5050 case 
Intrinsic::nvvm_suld_1d_v2i64_clamp: 5051 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 5052 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 5053 case Intrinsic::nvvm_suld_2d_i64_clamp: 5054 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 5055 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 5056 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 5057 case Intrinsic::nvvm_suld_3d_i64_clamp: 5058 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 5059 case Intrinsic::nvvm_suld_1d_i64_trap: 5060 case Intrinsic::nvvm_suld_1d_v2i64_trap: 5061 case Intrinsic::nvvm_suld_1d_array_i64_trap: 5062 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 5063 case Intrinsic::nvvm_suld_2d_i64_trap: 5064 case Intrinsic::nvvm_suld_2d_v2i64_trap: 5065 case Intrinsic::nvvm_suld_2d_array_i64_trap: 5066 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 5067 case Intrinsic::nvvm_suld_3d_i64_trap: 5068 case Intrinsic::nvvm_suld_3d_v2i64_trap: 5069 case Intrinsic::nvvm_suld_1d_i64_zero: 5070 case Intrinsic::nvvm_suld_1d_v2i64_zero: 5071 case Intrinsic::nvvm_suld_1d_array_i64_zero: 5072 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 5073 case Intrinsic::nvvm_suld_2d_i64_zero: 5074 case Intrinsic::nvvm_suld_2d_v2i64_zero: 5075 case Intrinsic::nvvm_suld_2d_array_i64_zero: 5076 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 5077 case Intrinsic::nvvm_suld_3d_i64_zero: 5078 case Intrinsic::nvvm_suld_3d_v2i64_zero: 5079 Info.opc = getOpcForSurfaceInstr(Intrinsic); 5080 Info.memVT = MVT::i64; 5081 Info.ptrVal = nullptr; 5082 Info.offset = 0; 5083 Info.flags = MachineMemOperand::MOLoad; 5084 Info.align = Align(16); 5085 return true; 5086 } 5087 return false; 5088 } 5089 5090 /// getFunctionParamOptimizedAlign - since function arguments are passed via 5091 /// .param space, we may want to increase their alignment in a way that 5092 /// ensures that we can effectively vectorize their loads & stores. We can 5093 /// increase alignment only if the function has internal or has private 5094 /// linkage as for other linkage types callers may already rely on default 5095 /// alignment. To allow using 128-bit vectorized loads/stores, this function 5096 /// ensures that alignment is 16 or greater. 5097 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( 5098 const Function *F, Type *ArgTy, const DataLayout &DL) const { 5099 // Capping the alignment to 128 bytes as that is the maximum alignment 5100 // supported by PTX. 5101 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy)); 5102 5103 // If a function has linkage different from internal or private, we 5104 // must use default ABI alignment as external users rely on it. Same 5105 // for a function that may be called from a function pointer. 5106 if (!F || !F->hasLocalLinkage() || 5107 F->hasAddressTaken(/*Users=*/nullptr, 5108 /*IgnoreCallbackUses=*/false, 5109 /*IgnoreAssumeLikeCalls=*/true, 5110 /*IgnoreLLVMUsed=*/true)) 5111 return ABITypeAlign; 5112 5113 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); 5114 return std::max(Align(16), ABITypeAlign); 5115 } 5116 5117 /// Helper for computing alignment of a device function byval parameter. 5118 Align NVPTXTargetLowering::getFunctionByValParamAlign( 5119 const Function *F, Type *ArgTy, Align InitialAlign, 5120 const DataLayout &DL) const { 5121 Align ArgAlign = InitialAlign; 5122 // Try to increase alignment to enhance vectorization options. 5123 if (F) 5124 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL)); 5125 5126 // Old ptx versions have a bug. 
When PTX code takes address of 5127 // byval parameter with alignment < 4, ptxas generates code to 5128 // spill argument into memory. Alas on sm_50+ ptxas generates 5129 // SASS code that fails with misaligned access. To work around 5130 // the problem, make sure that we align byval parameters by at 5131 // least 4. This bug seems to be fixed at least starting from 5132 // ptxas > 9.0. 5133 // TODO: remove this after verifying the bug is not reproduced 5134 // on non-deprecated ptxas versions. 5135 if (ForceMinByValParamAlign) 5136 ArgAlign = std::max(ArgAlign, Align(4)); 5137 5138 return ArgAlign; 5139 } 5140 5141 // Helper for getting a function parameter name. Name is composed from 5142 // its index and the function name. Negative index corresponds to special 5143 // parameter (unsized array) used for passing variable arguments. 5144 std::string NVPTXTargetLowering::getParamName(const Function *F, 5145 int Idx) const { 5146 std::string ParamName; 5147 raw_string_ostream ParamStr(ParamName); 5148 5149 ParamStr << getTargetMachine().getSymbol(F)->getName(); 5150 if (Idx < 0) 5151 ParamStr << "_vararg"; 5152 else 5153 ParamStr << "_param_" << Idx; 5154 5155 return ParamName; 5156 } 5157 5158 /// isLegalAddressingMode - Return true if the addressing mode represented 5159 /// by AM is legal for this target, for a load/store of the specified type. 5160 /// Used to guide target specific optimizations, like loop strength reduction 5161 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 5162 /// (CodeGenPrepare.cpp) 5163 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, 5164 const AddrMode &AM, Type *Ty, 5165 unsigned AS, Instruction *I) const { 5166 // AddrMode - This represents an addressing mode of: 5167 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 5168 // 5169 // The legal address modes are 5170 // - [avar] 5171 // - [areg] 5172 // - [areg+immoff] 5173 // - [immAddr] 5174 5175 // immoff must fit in a signed 32-bit int 5176 if (!APInt(64, AM.BaseOffs).isSignedIntN(32)) 5177 return false; 5178 5179 if (AM.BaseGV) 5180 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; 5181 5182 switch (AM.Scale) { 5183 case 0: // "r", "r+i" or "i" is allowed 5184 break; 5185 case 1: 5186 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 5187 return false; 5188 // Otherwise we have r+i. 5189 break; 5190 default: 5191 // No scale > 1 is allowed 5192 return false; 5193 } 5194 return true; 5195 } 5196 5197 //===----------------------------------------------------------------------===// 5198 // NVPTX Inline Assembly Support 5199 //===----------------------------------------------------------------------===// 5200 5201 /// getConstraintType - Given a constraint letter, return the type of 5202 /// constraint it is for this target. 
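///
/// Illustrative example (assumed, not taken from the original source): an
/// inline asm statement such as
///   asm("add.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
/// uses the single-letter constraint 'r', which is classified here as
/// C_RegisterClass and mapped below to the 32-bit integer register class.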
5203 NVPTXTargetLowering::ConstraintType 5204 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { 5205 if (Constraint.size() == 1) { 5206 switch (Constraint[0]) { 5207 default: 5208 break; 5209 case 'b': 5210 case 'r': 5211 case 'h': 5212 case 'c': 5213 case 'l': 5214 case 'f': 5215 case 'd': 5216 case 'q': 5217 case '0': 5218 case 'N': 5219 return C_RegisterClass; 5220 } 5221 } 5222 return TargetLowering::getConstraintType(Constraint); 5223 } 5224 5225 std::pair<unsigned, const TargetRegisterClass *> 5226 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 5227 StringRef Constraint, 5228 MVT VT) const { 5229 if (Constraint.size() == 1) { 5230 switch (Constraint[0]) { 5231 case 'b': 5232 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 5233 case 'c': 5234 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 5235 case 'h': 5236 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 5237 case 'r': 5238 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 5239 case 'l': 5240 case 'N': 5241 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 5242 case 'q': { 5243 if (STI.getSmVersion() < 70) 5244 report_fatal_error("Inline asm with 128 bit operands is only " 5245 "supported for sm_70 and higher!"); 5246 return std::make_pair(0U, &NVPTX::Int128RegsRegClass); 5247 } 5248 case 'f': 5249 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 5250 case 'd': 5251 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 5252 } 5253 } 5254 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 5255 } 5256 5257 //===----------------------------------------------------------------------===// 5258 // NVPTX DAG Combining 5259 //===----------------------------------------------------------------------===// 5260 5261 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, 5262 CodeGenOptLevel OptLevel) const { 5263 // Always honor command-line argument 5264 if (FMAContractLevelOpt.getNumOccurrences() > 0) 5265 return FMAContractLevelOpt > 0; 5266 5267 // Do not contract if we're not optimizing the code. 5268 if (OptLevel == CodeGenOptLevel::None) 5269 return false; 5270 5271 // Honor TargetOptions flags that explicitly say fusion is okay. 5272 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) 5273 return true; 5274 5275 return allowUnsafeFPMath(MF); 5276 } 5277 5278 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { 5279 // Honor TargetOptions flags that explicitly say unsafe math is okay. 5280 if (MF.getTarget().Options.UnsafeFPMath) 5281 return true; 5282 5283 // Allow unsafe math if unsafe-fp-math attribute explicitly says so. 5284 const Function &F = MF.getFunction(); 5285 return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); 5286 } 5287 5288 static bool isConstZero(const SDValue &Operand) { 5289 const auto *Const = dyn_cast<ConstantSDNode>(Operand); 5290 return Const && Const->getZExtValue() == 0; 5291 } 5292 5293 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 5294 /// operands N0 and N1. This is a helper for PerformADDCombine that is 5295 /// called with the default operands, and if that fails, with commuted 5296 /// operands. 
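///
/// Minimal sketch of the intended fold (illustrative, assumed operand names):
///   t0 = mul i32 %a, %b      ; t0 has no other users
///   t1 = add i32 t0, %c
/// becomes a single NVPTXISD::IMAD node, which typically selects to the PTX
/// mad.lo.s32 instruction.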
static SDValue
PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N0.getValueType();

  // Since integer multiply-add costs the same as integer multiply
  // but is more costly than integer add, do the fusion only when
  // the mul is only used in the add.
  // TODO: this may not be true for later architectures, consider relaxing this
  if (!N0.getNode()->hasOneUse())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL)
    return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
                           N0.getOperand(1), N1);

  // fold (add (select cond, 0, (mul a, b)), c)
  // -> (select cond, c, (mad a, b, c))
  //
  if (N0.getOpcode() == ISD::SELECT) {
    unsigned ZeroOpNum;
    if (isConstZero(N0->getOperand(1)))
      ZeroOpNum = 1;
    else if (isConstZero(N0->getOperand(2)))
      ZeroOpNum = 2;
    else
      return SDValue();

    SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
    if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
      return SDValue();

    SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                                  M->getOperand(0), M->getOperand(1), N1);
    return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
                             ((ZeroOpNum == 1) ? N1 : MAD),
                             ((ZeroOpNum == 1) ? MAD : N1));
  }

  return SDValue();
}

static SDValue
PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               CodeGenOptLevel OptLevel) {
  EVT VT = N0.getValueType();
  if (N0.getOpcode() == ISD::FMUL) {
    const auto *TLI = static_cast<const NVPTXTargetLowering *>(
        &DCI.DAG.getTargetLoweringInfo());
    if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
      return SDValue();

    // For floating point:
    // Do the fusion only when the mul has fewer than 5 uses and all of them
    // are adds.
    // The heuristic is that if a use is not an add, then that use cannot be
    // fused into an fma, so the mul is still needed anyway.
    // If there are more than 4 uses, even if they are all adds, fusing them
    // will increase register pressure.
    //
    int numUses = 0;
    int nonAddCount = 0;
    for (const SDNode *User : N0.getNode()->uses()) {
      numUses++;
      if (User->getOpcode() != ISD::FADD)
        ++nonAddCount;
      if (numUses >= 5)
        return SDValue();
    }
    if (nonAddCount) {
      int orderNo = N->getIROrder();
      int orderNo2 = N0.getNode()->getIROrder();
      // Simple heuristic for estimating potential register pressure: the
      // difference between the IR orders measures the distance between the
      // def and its use, and a longer distance is more likely to cause
      // register pressure.
      if (orderNo - orderNo2 < 500)
        return SDValue();

      // Now, check if at least one of the FMUL's operands is live beyond the
      // node N, which guarantees that the FMA will not increase register
      // pressure at node N.
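      // For example, if one FMUL operand has another use ordered after N, its
      // register must stay live past N regardless of this fold, so forming
      // the FMA cannot add a new live value at N. Constant operands are
      // treated as trivially live.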
5382 bool opIsLive = false; 5383 const SDNode *left = N0.getOperand(0).getNode(); 5384 const SDNode *right = N0.getOperand(1).getNode(); 5385 5386 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right)) 5387 opIsLive = true; 5388 5389 if (!opIsLive) 5390 for (const SDNode *User : left->uses()) { 5391 int orderNo3 = User->getIROrder(); 5392 if (orderNo3 > orderNo) { 5393 opIsLive = true; 5394 break; 5395 } 5396 } 5397 5398 if (!opIsLive) 5399 for (const SDNode *User : right->uses()) { 5400 int orderNo3 = User->getIROrder(); 5401 if (orderNo3 > orderNo) { 5402 opIsLive = true; 5403 break; 5404 } 5405 } 5406 5407 if (!opIsLive) 5408 return SDValue(); 5409 } 5410 5411 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0), 5412 N0.getOperand(1), N1); 5413 } 5414 5415 return SDValue(); 5416 } 5417 5418 static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front, 5419 std::size_t Back) { 5420 if (all_of(N->ops().drop_front(Front).drop_back(Back), 5421 [](const SDUse &U) { return U.get()->isUndef(); })) 5422 // Operand 0 is the previous value in the chain. Cannot return EntryToken 5423 // as the previous value will become unused and eliminated later. 5424 return N->getOperand(0); 5425 5426 return SDValue(); 5427 } 5428 5429 static SDValue PerformStoreParamCombine(SDNode *N) { 5430 // Operands from the 3rd to the 2nd last one are the values to be stored. 5431 // {Chain, ArgID, Offset, Val, Glue} 5432 return PerformStoreCombineHelper(N, 3, 1); 5433 } 5434 5435 static SDValue PerformStoreRetvalCombine(SDNode *N) { 5436 // Operands from the 2nd to the last one are the values to be stored 5437 return PerformStoreCombineHelper(N, 2, 0); 5438 } 5439 5440 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 5441 /// 5442 static SDValue PerformADDCombine(SDNode *N, 5443 TargetLowering::DAGCombinerInfo &DCI, 5444 CodeGenOptLevel OptLevel) { 5445 if (OptLevel == CodeGenOptLevel::None) 5446 return SDValue(); 5447 5448 SDValue N0 = N->getOperand(0); 5449 SDValue N1 = N->getOperand(1); 5450 5451 // Skip non-integer, non-scalar case 5452 EVT VT = N0.getValueType(); 5453 if (VT.isVector() || VT != MVT::i32) 5454 return SDValue(); 5455 5456 // First try with the default operand order. 5457 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI)) 5458 return Result; 5459 5460 // If that didn't work, try again with the operands commuted. 5461 return PerformADDCombineWithOperands(N, N1, N0, DCI); 5462 } 5463 5464 /// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD. 5465 /// 5466 static SDValue PerformFADDCombine(SDNode *N, 5467 TargetLowering::DAGCombinerInfo &DCI, 5468 CodeGenOptLevel OptLevel) { 5469 SDValue N0 = N->getOperand(0); 5470 SDValue N1 = N->getOperand(1); 5471 5472 EVT VT = N0.getValueType(); 5473 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64)) 5474 return SDValue(); 5475 5476 // First try with the default operand order. 5477 if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel)) 5478 return Result; 5479 5480 // If that didn't work, try again with the operands commuted. 5481 return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel); 5482 } 5483 5484 static SDValue PerformANDCombine(SDNode *N, 5485 TargetLowering::DAGCombinerInfo &DCI) { 5486 // The type legalizer turns a vector load of i8 values into a zextload to i16 5487 // registers, optionally ANY_EXTENDs it (if target type is integer), 5488 // and ANDs off the high 8 bits. 
Since we turn this load into a 5489 // target-specific DAG node, the DAG combiner fails to eliminate these AND 5490 // nodes. Do that here. 5491 SDValue Val = N->getOperand(0); 5492 SDValue Mask = N->getOperand(1); 5493 5494 if (isa<ConstantSDNode>(Val)) { 5495 std::swap(Val, Mask); 5496 } 5497 5498 SDValue AExt; 5499 5500 // Convert BFE-> truncate i16 -> and 255 5501 // To just BFE-> truncate i16, as the value already has all the bits in the 5502 // right places. 5503 if (Val.getOpcode() == ISD::TRUNCATE) { 5504 SDValue BFE = Val.getOperand(0); 5505 if (BFE.getOpcode() != NVPTXISD::BFE) 5506 return SDValue(); 5507 5508 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0)); 5509 if (!BFEBits) 5510 return SDValue(); 5511 uint64_t BFEBitsVal = BFEBits->getZExtValue(); 5512 5513 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 5514 if (!MaskCnst) { 5515 // Not an AND with a constant 5516 return SDValue(); 5517 } 5518 uint64_t MaskVal = MaskCnst->getZExtValue(); 5519 5520 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1) 5521 return SDValue(); 5522 // If we get here, the AND is unnecessary. Just replace it with the trunc 5523 DCI.CombineTo(N, Val, false); 5524 } 5525 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 5526 if (Val.getOpcode() == ISD::ANY_EXTEND) { 5527 AExt = Val; 5528 Val = Val->getOperand(0); 5529 } 5530 5531 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 5532 Val = Val->getOperand(0); 5533 } 5534 5535 if (Val->getOpcode() == NVPTXISD::LoadV2 || 5536 Val->getOpcode() == NVPTXISD::LoadV4) { 5537 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 5538 if (!MaskCnst) { 5539 // Not an AND with a constant 5540 return SDValue(); 5541 } 5542 5543 uint64_t MaskVal = MaskCnst->getZExtValue(); 5544 if (MaskVal != 0xff) { 5545 // Not an AND that chops off top 8 bits 5546 return SDValue(); 5547 } 5548 5549 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 5550 if (!Mem) { 5551 // Not a MemSDNode?!? 5552 return SDValue(); 5553 } 5554 5555 EVT MemVT = Mem->getMemoryVT(); 5556 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 5557 // We only handle the i8 case 5558 return SDValue(); 5559 } 5560 5561 unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1); 5562 if (ExtType == ISD::SEXTLOAD) { 5563 // If for some reason the load is a sextload, the and is needed to zero 5564 // out the high 8 bits 5565 return SDValue(); 5566 } 5567 5568 bool AddTo = false; 5569 if (AExt.getNode() != nullptr) { 5570 // Re-insert the ext as a zext. 5571 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 5572 AExt.getValueType(), Val); 5573 AddTo = true; 5574 } 5575 5576 // If we get here, the AND is unnecessary. Just replace it with the load 5577 DCI.CombineTo(N, Val, AddTo); 5578 } 5579 5580 return SDValue(); 5581 } 5582 5583 static SDValue PerformREMCombine(SDNode *N, 5584 TargetLowering::DAGCombinerInfo &DCI, 5585 CodeGenOptLevel OptLevel) { 5586 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); 5587 5588 // Don't do anything at less than -O2. 5589 if (OptLevel < CodeGenOptLevel::Default) 5590 return SDValue(); 5591 5592 SelectionDAG &DAG = DCI.DAG; 5593 SDLoc DL(N); 5594 EVT VT = N->getValueType(0); 5595 bool IsSigned = N->getOpcode() == ISD::SREM; 5596 unsigned DivOpc = IsSigned ? 
ISD::SDIV : ISD::UDIV; 5597 5598 const SDValue &Num = N->getOperand(0); 5599 const SDValue &Den = N->getOperand(1); 5600 5601 for (const SDNode *U : Num->uses()) { 5602 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && 5603 U->getOperand(1) == Den) { 5604 // Num % Den -> Num - (Num / Den) * Den 5605 return DAG.getNode(ISD::SUB, DL, VT, Num, 5606 DAG.getNode(ISD::MUL, DL, VT, 5607 DAG.getNode(DivOpc, DL, VT, Num, Den), 5608 Den)); 5609 } 5610 } 5611 return SDValue(); 5612 } 5613 5614 enum OperandSignedness { 5615 Signed = 0, 5616 Unsigned, 5617 Unknown 5618 }; 5619 5620 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 5621 /// that can be demoted to \p OptSize bits without loss of information. The 5622 /// signedness of the operand, if determinable, is placed in \p S. 5623 static bool IsMulWideOperandDemotable(SDValue Op, 5624 unsigned OptSize, 5625 OperandSignedness &S) { 5626 S = Unknown; 5627 5628 if (Op.getOpcode() == ISD::SIGN_EXTEND || 5629 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 5630 EVT OrigVT = Op.getOperand(0).getValueType(); 5631 if (OrigVT.getFixedSizeInBits() <= OptSize) { 5632 S = Signed; 5633 return true; 5634 } 5635 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 5636 EVT OrigVT = Op.getOperand(0).getValueType(); 5637 if (OrigVT.getFixedSizeInBits() <= OptSize) { 5638 S = Unsigned; 5639 return true; 5640 } 5641 } 5642 5643 return false; 5644 } 5645 5646 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 5647 /// be demoted to \p OptSize bits without loss of information. If the operands 5648 /// contain a constant, it should appear as the RHS operand. The signedness of 5649 /// the operands is placed in \p IsSigned. 5650 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 5651 unsigned OptSize, 5652 bool &IsSigned) { 5653 OperandSignedness LHSSign; 5654 5655 // The LHS operand must be a demotable op 5656 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 5657 return false; 5658 5659 // We should have been able to determine the signedness from the LHS 5660 if (LHSSign == Unknown) 5661 return false; 5662 5663 IsSigned = (LHSSign == Signed); 5664 5665 // The RHS can be a demotable op or a constant 5666 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 5667 const APInt &Val = CI->getAPIntValue(); 5668 if (LHSSign == Unsigned) { 5669 return Val.isIntN(OptSize); 5670 } else { 5671 return Val.isSignedIntN(OptSize); 5672 } 5673 } else { 5674 OperandSignedness RHSSign; 5675 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 5676 return false; 5677 5678 return LHSSign == RHSSign; 5679 } 5680 } 5681 5682 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 5683 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 5684 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 5685 /// amount. 
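///
/// Minimal sketch (illustrative, assumed operand names):
///   (mul i32 (sext i16 %a), (sext i16 %b))
/// is turned into NVPTXISD::MUL_WIDE_SIGNED on the truncated i16 operands,
/// which selects to PTX mul.wide.s16 and still produces the full 32-bit
/// product.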
5686 static SDValue TryMULWIDECombine(SDNode *N, 5687 TargetLowering::DAGCombinerInfo &DCI) { 5688 EVT MulType = N->getValueType(0); 5689 if (MulType != MVT::i32 && MulType != MVT::i64) { 5690 return SDValue(); 5691 } 5692 5693 SDLoc DL(N); 5694 unsigned OptSize = MulType.getSizeInBits() >> 1; 5695 SDValue LHS = N->getOperand(0); 5696 SDValue RHS = N->getOperand(1); 5697 5698 // Canonicalize the multiply so the constant (if any) is on the right 5699 if (N->getOpcode() == ISD::MUL) { 5700 if (isa<ConstantSDNode>(LHS)) { 5701 std::swap(LHS, RHS); 5702 } 5703 } 5704 5705 // If we have a SHL, determine the actual multiply amount 5706 if (N->getOpcode() == ISD::SHL) { 5707 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 5708 if (!ShlRHS) { 5709 return SDValue(); 5710 } 5711 5712 APInt ShiftAmt = ShlRHS->getAPIntValue(); 5713 unsigned BitWidth = MulType.getSizeInBits(); 5714 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 5715 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 5716 RHS = DCI.DAG.getConstant(MulVal, DL, MulType); 5717 } else { 5718 return SDValue(); 5719 } 5720 } 5721 5722 bool Signed; 5723 // Verify that our operands are demotable 5724 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 5725 return SDValue(); 5726 } 5727 5728 EVT DemotedVT; 5729 if (MulType == MVT::i32) { 5730 DemotedVT = MVT::i16; 5731 } else { 5732 DemotedVT = MVT::i32; 5733 } 5734 5735 // Truncate the operands to the correct size. Note that these are just for 5736 // type consistency and will (likely) be eliminated in later phases. 5737 SDValue TruncLHS = 5738 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); 5739 SDValue TruncRHS = 5740 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); 5741 5742 unsigned Opc; 5743 if (Signed) { 5744 Opc = NVPTXISD::MUL_WIDE_SIGNED; 5745 } else { 5746 Opc = NVPTXISD::MUL_WIDE_UNSIGNED; 5747 } 5748 5749 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); 5750 } 5751 5752 static bool isConstOne(const SDValue &Operand) { 5753 const auto *Const = dyn_cast<ConstantSDNode>(Operand); 5754 return Const && Const->getZExtValue() == 1; 5755 } 5756 5757 static SDValue matchMADConstOnePattern(SDValue Add) { 5758 if (Add->getOpcode() != ISD::ADD) 5759 return SDValue(); 5760 5761 if (isConstOne(Add->getOperand(0))) 5762 return Add->getOperand(1); 5763 5764 if (isConstOne(Add->getOperand(1))) 5765 return Add->getOperand(0); 5766 5767 return SDValue(); 5768 } 5769 5770 static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, 5771 TargetLowering::DAGCombinerInfo &DCI) { 5772 5773 if (SDValue Y = matchMADConstOnePattern(Add)) 5774 return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X); 5775 5776 return SDValue(); 5777 } 5778 5779 static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, 5780 SDLoc DL, 5781 TargetLowering::DAGCombinerInfo &DCI) { 5782 if (Select->getOpcode() != ISD::SELECT) 5783 return SDValue(); 5784 5785 SDValue Cond = Select->getOperand(0); 5786 5787 unsigned ConstOpNo; 5788 if (isConstOne(Select->getOperand(1))) 5789 ConstOpNo = 1; 5790 else if (isConstOne(Select->getOperand(2))) 5791 ConstOpNo = 2; 5792 else 5793 return SDValue(); 5794 5795 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1); 5796 5797 // Do not combine if the resulting sequence is not obviously profitable. 5798 if (!matchMADConstOnePattern(Y)) 5799 return SDValue(); 5800 5801 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y); 5802 5803 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond, 5804 (ConstOpNo == 1) ? 
  return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
                         (ConstOpNo == 1) ? X : NewMul,
                         (ConstOpNo == 1) ? NewMul : X);
}

static SDValue
PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {

  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDLoc DL(N);

  // (mul x, (add y, 1)) -> (mad x, y, x)
  if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
    return Res;
  if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
    return Res;

  // (mul x, (select y, 1)) -> (select (mul x, y), x)
  if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
    return Res;
  if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
    return Res;

  return SDValue();
}

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  if (SDValue Ret = TryMULWIDECombine(N, DCI))
    return Ret;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  return PerformMULCombineWithOperands(N, N0, N1, DCI);
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel > CodeGenOptLevel::None) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   unsigned int SmVersion) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);

  EVT AType = A.getValueType();
  if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
    return SDValue();

  if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
  SDValue CCNode = DCI.DAG.getNode(
      A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
                                     : NVPTXISD::SETP_BF16X2,
      DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

static SDValue PerformEXTRACTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Vector = N->getOperand(0);
  SDLoc DL(N);
  EVT VectorVT = Vector.getValueType();
  if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
      IsPTXVectorType(VectorVT.getSimpleVT()))
    return SDValue(); // Native vector loads already combine nicely w/
                      // extract_vector_elt.
  // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
  // handle them OK.
  if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
      VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
    return SDValue();

  // Don't mess with undef values as sra may be simplified to 0, not undef.
  if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
    return SDValue();

  uint64_t VectorBits = VectorVT.getSizeInBits();
  // We only handle the types we can extract in-register.
  if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
    return SDValue();

  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
  // Index == 0 is handled by generic DAG combiner.
  if (!Index || Index->getZExtValue() == 0)
    return SDValue();

  MVT IVT = MVT::getIntegerVT(VectorBits);
  EVT EltVT = VectorVT.getVectorElementType();
  EVT EltIVT = EltVT.changeTypeToInteger();
  uint64_t EltBits = EltVT.getScalarSizeInBits();

  SDValue Result = DCI.DAG.getNode(
      ISD::TRUNCATE, DL, EltIVT,
      DCI.DAG.getNode(
          ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
          DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));

  // If the element has a non-integer type, bitcast it back to the expected
  // type.
  if (EltVT != EltIVT)
    Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
  // Past the legalizer, we may need to extend i8 -> i16 to match the register
  // type.
  if (EltVT != N->getValueType(0))
    Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);

  return Result;
}

static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue VA = N->getOperand(1);
  EVT VectorVT = VA.getValueType();
  if (VectorVT != MVT::v4i8)
    return SDValue();

  // We need to split the vselect into individual per-element operations.
  // Because we use BFE/BFI instructions for byte extraction/insertion, we end
  // up with 32-bit values, so we may as well do the comparison as i32 to avoid
  // conversions to/from i16 normally used for i8 values.
  SmallVector<SDValue, 4> E;
  SDLoc DL(N);
  SDValue VCond = N->getOperand(0);
  SDValue VB = N->getOperand(2);
  for (int I = 0; I < 4; ++I) {
    SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
                                DCI.DAG.getConstant(I, DL, MVT::i32));
    SDValue EA = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    SDValue EB = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    E.push_back(DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
  }
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
}

static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  LoadSDNode *LD = cast<LoadSDNode>(N);

  // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
  // letting ReplaceLoadVector split it into smaller loads during legalization.
  // This is done at dag-combine1 time, so that vector operations with i8
  // elements can be optimised away instead of being needlessly split during
  // legalization, which involves storing to the stack and loading it back.
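  // Sketch of the resulting DAG (added commentary, not from the original
  // source): the v16i8 load becomes a single NVPTXISD::LoadV4 producing four
  // i32 values plus a chain; the four values are packed into a v4i32
  // build_vector and bitcast back to v16i8 for the original users.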
  EVT VT = N->getValueType(0);
  if (VT != MVT::v16i8)
    return SDValue();

  SDLoc DL(N);

  // Create a v4i32 vector load operation, effectively <4 x v4i8>.
  unsigned Opc = NVPTXISD::LoadV4;
  EVT NewVT = MVT::v4i32;
  EVT EltVT = NewVT.getVectorElementType();
  unsigned NumElts = NewVT.getVectorNumElements();
  EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
  SDVTList RetVTList = DAG.getVTList(RetVTs);
  SmallVector<SDValue, 8> Ops(N->ops());
  Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
  SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
                                            LD->getMemOperand());
  SDValue NewChain = NewLoad.getValue(NumElts);

  // Create a vector of the same type returned by the original load.
  SmallVector<SDValue, 4> Elts;
  for (unsigned i = 0; i < NumElts; i++)
    Elts.push_back(NewLoad.getValue(i));
  return DCI.DAG.getMergeValues(
      {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
       NewChain},
      DL);
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD:
    return PerformADDCombine(N, DCI, OptLevel);
  case ISD::FADD:
    return PerformFADDCombine(N, DCI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI, STI.getSmVersion());
  case ISD::LOAD:
    return PerformLOADCombine(N, DCI);
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    return PerformStoreRetvalCombine(N);
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
    return PerformStoreParamCombine(N);
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACTCombine(N, DCI);
  case ISD::VSELECT:
    return PerformVSELECTCombine(N, DCI);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
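  // For example (added commentary, not from the original source): a
  // sufficiently aligned load of <4 x float> is replaced by a single
  // NVPTXISD::LoadV4 with four f32 results, whereas <4 x double> is rejected
  // by the switch below and left to the generic legalization path instead.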
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v8i16:  // <4 x i16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool Load16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have an ld.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
    assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
    Load16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT VVT;
    switch (EltVT.getSimpleVT().SimpleTy) {
    case MVT::f16:
      VVT = MVT::v2f16;
      break;
    case MVT::bf16:
      VVT = MVT::v2bf16;
      break;
    case MVT::i16:
      VVT = MVT::v2i16;
      break;
    default:
      llvm_unreachable("Unsupported v8 vector type.");
    }
    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (Load16x2) {
    // Split v2f16 subvectors back into individual elements.
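    // (Added commentary, illustrative.) For a v8f16 load, NewLD carries four
    // v2f16 results; each is split into two f16 scalars below so that the
    // final build_vector matches the original v8f16 result type.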
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}

static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization.
      // Therefore, we must ensure the type is legal. For i1 and i8, we set the
      // loaded type to i16 and propagate the "real" type as the memory type.
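      // (Added commentary, illustrative.) E.g. a <4 x i8> ldg would be emitted
      // as an LDGV4 with four i16 results and an i8 memory type, and each
      // result is truncated back to i8 in the loop below.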
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands

      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      // Others
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec =
          DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
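      // (Added commentary, illustrative.) The scalar i8 ldg/ldu result is thus
      // produced in a 16-bit register and truncated back to i8 immediately
      // below, matching how sub-16-bit values are handled above.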
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &Results) {
  // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
  // result so that it can pass legalization.
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Reg = N->getOperand(1);
  SDValue Glue = N->getOperand(2);

  assert(Reg.getValueType() == MVT::i128 &&
         "Custom lowering for CopyFromReg with 128-bit reg only");
  SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
                                     N->getValueType(2)};
  SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};

  SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
                             {NewValue.getValue(0), NewValue.getValue(1)});

  Results.push_back(Pair);
  Results.push_back(NewValue.getValue(2));
  Results.push_back(NewValue.getValue(3));
}

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  case ISD::CopyFromReg:
    ReplaceCopyFromReg_128(N, DAG, Results);
    return;
  }
}

NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
          STI.getPTXVersion() >= 63)
        return AtomicExpansionKind::None;
      if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
          STI.getPTXVersion() >= 78)
        return AtomicExpansionKind::None;
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}