//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <optional>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static std::atomic<unsigned> GlobalUniqueCallSite;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"),
    cl::init(false));

static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
             " 1: do it, 2: do it aggressively)"),
    cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE compliant F32 div.rnd if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

static cl::opt<bool> ForceMinByValParamAlign(
    "nvptx-force-min-byval-param-align", cl::Hidden,
    cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
             " params of device functions."),
    cl::init(false));

int
NVPTXTargetLowering::getDivF32Level() const { 103 if (UsePrecDivF32.getNumOccurrences() > 0) { 104 // If nvptx-prec-div32=N is used on the command-line, always honor it 105 return UsePrecDivF32; 106 } else { 107 // Otherwise, use div.approx if fast math is enabled 108 if (getTargetMachine().Options.UnsafeFPMath) 109 return 0; 110 else 111 return 2; 112 } 113 } 114 115 bool NVPTXTargetLowering::usePrecSqrtF32() const { 116 if (UsePrecSqrtF32.getNumOccurrences() > 0) { 117 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it 118 return UsePrecSqrtF32; 119 } else { 120 // Otherwise, use sqrt.approx if fast math is enabled 121 return !getTargetMachine().Options.UnsafeFPMath; 122 } 123 } 124 125 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const { 126 return MF.getDenormalMode(APFloat::IEEEsingle()).Output == 127 DenormalMode::PreserveSign; 128 } 129 130 static bool IsPTXVectorType(MVT VT) { 131 switch (VT.SimpleTy) { 132 default: 133 return false; 134 case MVT::v2i1: 135 case MVT::v4i1: 136 case MVT::v2i8: 137 case MVT::v4i8: 138 case MVT::v2i16: 139 case MVT::v4i16: 140 case MVT::v8i16: // <4 x i16x2> 141 case MVT::v2i32: 142 case MVT::v4i32: 143 case MVT::v2i64: 144 case MVT::v2f16: 145 case MVT::v4f16: 146 case MVT::v8f16: // <4 x f16x2> 147 case MVT::v2bf16: 148 case MVT::v4bf16: 149 case MVT::v8bf16: // <4 x bf16x2> 150 case MVT::v2f32: 151 case MVT::v4f32: 152 case MVT::v2f64: 153 return true; 154 } 155 } 156 157 static bool Is16bitsType(MVT VT) { 158 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 || 159 VT.SimpleTy == MVT::i16); 160 } 161 162 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive 163 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors 164 /// into their primitive components. 165 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the 166 /// same number of types as the Ins/Outs arrays in LowerFormalArguments, 167 /// LowerCall, and LowerReturn. 168 static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, 169 Type *Ty, SmallVectorImpl<EVT> &ValueVTs, 170 SmallVectorImpl<uint64_t> *Offsets = nullptr, 171 uint64_t StartingOffset = 0) { 172 SmallVector<EVT, 16> TempVTs; 173 SmallVector<uint64_t, 16> TempOffsets; 174 175 // Special case for i128 - decompose to (i64, i64) 176 if (Ty->isIntegerTy(128)) { 177 ValueVTs.push_back(EVT(MVT::i64)); 178 ValueVTs.push_back(EVT(MVT::i64)); 179 180 if (Offsets) { 181 Offsets->push_back(StartingOffset + 0); 182 Offsets->push_back(StartingOffset + 8); 183 } 184 185 return; 186 } 187 188 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs. 189 if (StructType *STy = dyn_cast<StructType>(Ty)) { 190 auto const *SL = DL.getStructLayout(STy); 191 auto ElementNum = 0; 192 for(auto *EI : STy->elements()) { 193 ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets, 194 StartingOffset + SL->getElementOffset(ElementNum)); 195 ++ElementNum; 196 } 197 return; 198 } 199 200 ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); 201 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { 202 EVT VT = TempVTs[i]; 203 uint64_t Off = TempOffsets[i]; 204 // Split vectors into individual elements, except for v2f16, which 205 // we will pass as a single scalar. 
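    // For example, a <4 x half> piece is recorded as two v2f16 values at Off
    // and Off + 4, and a <3 x i8> piece as a single v4i8 value.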
206 if (VT.isVector()) { 207 unsigned NumElts = VT.getVectorNumElements(); 208 EVT EltVT = VT.getVectorElementType(); 209 // Vectors with an even number of f16 elements will be passed to 210 // us as an array of v2f16/v2bf16 elements. We must match this so we 211 // stay in sync with Ins/Outs. 212 if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0) { 213 switch (EltVT.getSimpleVT().SimpleTy) { 214 case MVT::f16: 215 EltVT = MVT::v2f16; 216 break; 217 case MVT::bf16: 218 EltVT = MVT::v2bf16; 219 break; 220 case MVT::i16: 221 EltVT = MVT::v2i16; 222 break; 223 default: 224 llvm_unreachable("Unexpected type"); 225 } 226 NumElts /= 2; 227 } else if (EltVT.getSimpleVT() == MVT::i8 && 228 (NumElts % 4 == 0 || NumElts == 3)) { 229 // v*i8 are formally lowered as v4i8 230 EltVT = MVT::v4i8; 231 NumElts = (NumElts + 3) / 4; 232 } 233 for (unsigned j = 0; j != NumElts; ++j) { 234 ValueVTs.push_back(EltVT); 235 if (Offsets) 236 Offsets->push_back(Off + j * EltVT.getStoreSize()); 237 } 238 } else { 239 ValueVTs.push_back(VT); 240 if (Offsets) 241 Offsets->push_back(Off); 242 } 243 } 244 } 245 246 /// PromoteScalarIntegerPTX 247 /// Used to make sure the arguments/returns are suitable for passing 248 /// and promote them to a larger size if they're not. 249 /// 250 /// The promoted type is placed in \p PromoteVT if the function returns true. 251 static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) { 252 if (VT.isScalarInteger()) { 253 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) { 254 default: 255 llvm_unreachable( 256 "Promotion is not suitable for scalars of size larger than 64-bits"); 257 case 1: 258 *PromotedVT = MVT::i1; 259 break; 260 case 2: 261 case 4: 262 case 8: 263 *PromotedVT = MVT::i8; 264 break; 265 case 16: 266 *PromotedVT = MVT::i16; 267 break; 268 case 32: 269 *PromotedVT = MVT::i32; 270 break; 271 case 64: 272 *PromotedVT = MVT::i64; 273 break; 274 } 275 return EVT(*PromotedVT) != VT; 276 } 277 return false; 278 } 279 280 // Check whether we can merge loads/stores of some of the pieces of a 281 // flattened function parameter or return value into a single vector 282 // load/store. 283 // 284 // The flattened parameter is represented as a list of EVTs and 285 // offsets, and the whole structure is aligned to ParamAlignment. This 286 // function determines whether we can load/store pieces of the 287 // parameter starting at index Idx using a single vectorized op of 288 // size AccessSize. If so, it returns the number of param pieces 289 // covered by the vector op. Otherwise, it returns 1. 290 static unsigned CanMergeParamLoadStoresStartingAt( 291 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs, 292 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) { 293 294 // Can't vectorize if param alignment is not sufficient. 295 if (ParamAlignment < AccessSize) 296 return 1; 297 // Can't vectorize if offset is not aligned. 298 if (Offsets[Idx] & (AccessSize - 1)) 299 return 1; 300 301 EVT EltVT = ValueVTs[Idx]; 302 unsigned EltSize = EltVT.getStoreSize(); 303 304 // Element is too large to vectorize. 305 if (EltSize >= AccessSize) 306 return 1; 307 308 unsigned NumElts = AccessSize / EltSize; 309 // Can't vectorize if AccessBytes if not a multiple of EltSize. 310 if (AccessSize != EltSize * NumElts) 311 return 1; 312 313 // We don't have enough elements to vectorize. 314 if (Idx + NumElts > ValueVTs.size()) 315 return 1; 316 317 // PTX ISA can only deal with 2- and 4-element vector ops. 
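  // For example, eight i16 pieces that would fill a 16-byte access are
  // rejected here; the caller retries with a smaller AccessSize and can still
  // merge them as quads or pairs (see VectorizePTXValueVTs below).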
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
  return NumElts;
}

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0,  // Middle elements of a vector.
  PVF_FIRST = 0x1,  // First element of the vector.
  PVF_LAST = 0x2,   // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment, bool IsVAArg = false) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  if (IsVAArg)
    return VectorInfo;

  // Check what we can vectorize using 128/64/32/16-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using the largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
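  // Setting these thresholds to UINT_MAX effectively removes the size limit,
  // so the inline load/store expansion is always chosen.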
408 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF; 409 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF; 410 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF; 411 412 setBooleanContents(ZeroOrNegativeOneBooleanContent); 413 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 414 415 // Jump is Expensive. Don't create extra control flow for 'and', 'or' 416 // condition branches. 417 setJumpIsExpensive(true); 418 419 // Wide divides are _very_ slow. Try to reduce the width of the divide if 420 // possible. 421 addBypassSlowDiv(64, 32); 422 423 // By default, use the Source scheduling 424 if (sched4reg) 425 setSchedulingPreference(Sched::RegPressure); 426 else 427 setSchedulingPreference(Sched::Source); 428 429 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, 430 LegalizeAction NoF16Action) { 431 setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action); 432 }; 433 434 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, 435 LegalizeAction NoBF16Action) { 436 bool IsOpSupported = STI.hasBF16Math(); 437 // Few instructions are available on sm_90 only 438 switch(Op) { 439 case ISD::FADD: 440 case ISD::FMUL: 441 case ISD::FSUB: 442 case ISD::SELECT: 443 case ISD::SELECT_CC: 444 case ISD::SETCC: 445 case ISD::FEXP2: 446 case ISD::FCEIL: 447 case ISD::FFLOOR: 448 case ISD::FNEARBYINT: 449 case ISD::FRINT: 450 case ISD::FROUNDEVEN: 451 case ISD::FTRUNC: 452 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78; 453 break; 454 } 455 setOperationAction( 456 Op, VT, IsOpSupported ? Action : NoBF16Action); 457 }; 458 459 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, 460 LegalizeAction NoI16x2Action) { 461 bool IsOpSupported = false; 462 // instructions are available on sm_90 only 463 switch (Op) { 464 case ISD::ADD: 465 case ISD::SMAX: 466 case ISD::SMIN: 467 case ISD::UMIN: 468 case ISD::UMAX: 469 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80; 470 break; 471 } 472 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action); 473 }; 474 475 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); 476 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); 477 addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass); 478 addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass); 479 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); 480 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); 481 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); 482 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); 483 addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass); 484 addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass); 485 addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass); 486 addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass); 487 488 // Conversion to/from FP16/FP16x2 is always legal. 
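  // A v2f16 value is kept in a single 32-bit register (see the register class
  // assignments above), which is why BUILD_VECTOR and EXTRACT_VECTOR_ELT are
  // custom-lowered below to work on that packed form.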
489 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); 490 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); 491 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand); 492 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand); 493 494 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); 495 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31) 496 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal); 497 498 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote); 499 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand); 500 501 // Conversion to/from BFP16/BFP16x2 is always legal. 502 setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom); 503 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom); 504 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand); 505 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand); 506 507 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand); 508 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote); 509 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote) 510 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32); 511 512 // Conversion to/from i16/i16x2 is always legal. 513 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); 514 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); 515 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand); 516 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand); 517 518 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom); 519 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); 520 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); 521 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); 522 // Only logical ops can be done on v4i8 directly, others must be done 523 // elementwise. 524 setOperationAction( 525 {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE, 526 ISD::BITREVERSE, ISD::CTLZ, ISD::CTPOP, ISD::CTTZ, 527 ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FSHL, ISD::FSHR, 528 ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::PARITY, 529 ISD::ROTL, ISD::ROTR, ISD::SADDO, ISD::SADDO_CARRY, 530 ISD::SADDSAT, ISD::SDIV, ISD::SDIVREM, ISD::SELECT_CC, 531 ISD::SETCC, ISD::SHL, ISD::SINT_TO_FP, ISD::SMAX, 532 ISD::SMIN, ISD::SMULO, ISD::SMUL_LOHI, ISD::SRA, 533 ISD::SREM, ISD::SRL, ISD::SSHLSAT, ISD::SSUBO, 534 ISD::SSUBO_CARRY, ISD::SSUBSAT, ISD::SUB, ISD::SUBC, 535 ISD::SUBE, ISD::UADDO, ISD::UADDO_CARRY, ISD::UADDSAT, 536 ISD::UDIV, ISD::UDIVREM, ISD::UINT_TO_FP, ISD::UMAX, 537 ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM, 538 ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT, 539 ISD::USUBSAT}, 540 MVT::v4i8, Expand); 541 542 // Operations not directly supported by NVPTX. 543 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, 544 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8, 545 MVT::i32, MVT::i64}) { 546 setOperationAction(ISD::SELECT_CC, VT, Expand); 547 setOperationAction(ISD::BR_CC, VT, Expand); 548 } 549 550 // Some SIGN_EXTEND_INREG can be done using cvt instruction. 551 // For others we will expand to a SHL/SRA pair. 
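  // For example, sign-extending the low 8 bits within an i32 maps onto a
  // single cvt, while i1 and v2i16 (marked Expand below) are lowered to a
  // shift-left / arithmetic-shift-right pair.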
552 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); 553 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 554 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 555 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 556 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 557 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); 558 559 setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); 560 setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); 561 setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); 562 setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); 563 setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); 564 setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); 565 566 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 567 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); 568 569 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs 570 // that don't have h/w rotation we lower them to multi-instruction assembly. 571 // See ROT*_sw in NVPTXIntrInfo.td 572 setOperationAction(ISD::ROTL, MVT::i64, Legal); 573 setOperationAction(ISD::ROTR, MVT::i64, Legal); 574 setOperationAction(ISD::ROTL, MVT::i32, Legal); 575 setOperationAction(ISD::ROTR, MVT::i32, Legal); 576 577 setOperationAction(ISD::ROTL, MVT::i16, Expand); 578 setOperationAction(ISD::ROTL, MVT::v2i16, Expand); 579 setOperationAction(ISD::ROTR, MVT::i16, Expand); 580 setOperationAction(ISD::ROTR, MVT::v2i16, Expand); 581 setOperationAction(ISD::ROTL, MVT::i8, Expand); 582 setOperationAction(ISD::ROTR, MVT::i8, Expand); 583 setOperationAction(ISD::BSWAP, MVT::i16, Expand); 584 585 // Indirect branch is not supported. 586 // This also disables Jump Table creation. 587 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 588 setOperationAction(ISD::BRIND, MVT::Other, Expand); 589 590 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 591 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 592 593 // We want to legalize constant related memmove and memcopy 594 // intrinsics. 595 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); 596 597 // Turn FP extload into load/fpextend 598 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); 599 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); 600 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); 601 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); 602 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); 603 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); 604 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); 605 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand); 606 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand); 607 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); 608 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); 609 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); 610 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand); 611 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand); 612 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); 613 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); 614 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); 615 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand); 616 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand); 617 // Turn FP truncstore into trunc + store. 
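  // For example, a truncating f32 -> f16 store is legalized into an FP_ROUND
  // followed by an ordinary 16-bit store.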
618 // FIXME: vector types should also be expanded 619 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 620 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 621 setTruncStoreAction(MVT::f32, MVT::bf16, Expand); 622 setTruncStoreAction(MVT::f64, MVT::bf16, Expand); 623 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 624 625 // PTX does not support load / store predicate registers 626 setOperationAction(ISD::LOAD, MVT::i1, Custom); 627 setOperationAction(ISD::STORE, MVT::i1, Custom); 628 629 for (MVT VT : MVT::integer_valuetypes()) { 630 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 631 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); 632 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); 633 setTruncStoreAction(VT, MVT::i1, Expand); 634 } 635 636 // expand extload of vector of integers. 637 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16, 638 MVT::v2i8, Expand); 639 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); 640 641 // This is legal in NVPTX 642 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 643 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 644 setOperationAction(ISD::ConstantFP, MVT::f16, Legal); 645 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); 646 647 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 648 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); 649 650 // TRAP can be lowered to PTX trap 651 setOperationAction(ISD::TRAP, MVT::Other, Legal); 652 653 // Register custom handling for vector loads/stores 654 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 655 if (IsPTXVectorType(VT)) { 656 setOperationAction(ISD::LOAD, VT, Custom); 657 setOperationAction(ISD::STORE, VT, Custom); 658 setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); 659 } 660 } 661 662 // Support varargs. 663 setOperationAction(ISD::VASTART, MVT::Other, Custom); 664 setOperationAction(ISD::VAARG, MVT::Other, Custom); 665 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 666 setOperationAction(ISD::VAEND, MVT::Other, Expand); 667 668 // Custom handling for i8 intrinsics 669 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); 670 671 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) { 672 setOperationAction(ISD::ABS, Ty, Legal); 673 setOperationAction(ISD::SMIN, Ty, Legal); 674 setOperationAction(ISD::SMAX, Ty, Legal); 675 setOperationAction(ISD::UMIN, Ty, Legal); 676 setOperationAction(ISD::UMAX, Ty, Legal); 677 678 setOperationAction(ISD::CTPOP, Ty, Legal); 679 setOperationAction(ISD::CTLZ, Ty, Legal); 680 } 681 682 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom); 683 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom); 684 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom); 685 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom); 686 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom); 687 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand); 688 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand); 689 690 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom); 691 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom); 692 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom); 693 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom); 694 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom); 695 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom); 696 697 // Other arithmetic and logic ops are unsupported. 
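  // Expanding these for v2i16 lets the legalizer unroll them into two i16
  // operations.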
698 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS, 699 ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT, 700 ISD::SINT_TO_FP, ISD::UINT_TO_FP}, 701 MVT::v2i16, Expand); 702 703 setOperationAction(ISD::ADDC, MVT::i32, Legal); 704 setOperationAction(ISD::ADDE, MVT::i32, Legal); 705 setOperationAction(ISD::SUBC, MVT::i32, Legal); 706 setOperationAction(ISD::SUBE, MVT::i32, Legal); 707 if (STI.getPTXVersion() >= 43) { 708 setOperationAction(ISD::ADDC, MVT::i64, Legal); 709 setOperationAction(ISD::ADDE, MVT::i64, Legal); 710 setOperationAction(ISD::SUBC, MVT::i64, Legal); 711 setOperationAction(ISD::SUBE, MVT::i64, Legal); 712 } 713 714 setOperationAction(ISD::CTTZ, MVT::i16, Expand); 715 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand); 716 setOperationAction(ISD::CTTZ, MVT::i32, Expand); 717 setOperationAction(ISD::CTTZ, MVT::i64, Expand); 718 719 // PTX does not directly support SELP of i1, so promote to i32 first 720 setOperationAction(ISD::SELECT, MVT::i1, Custom); 721 722 // PTX cannot multiply two i64s in a single instruction. 723 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); 724 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); 725 726 // We have some custom DAG combine patterns for these nodes 727 setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, 728 ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, 729 ISD::VSELECT}); 730 731 // setcc for f16x2 and bf16x2 needs special handling to prevent 732 // legalizer's attempt to scalarize it due to v2i1 not being legal. 733 if (STI.allowFP16Math() || STI.hasBF16Math()) 734 setTargetDAGCombine(ISD::SETCC); 735 736 // Promote fp16 arithmetic if fp16 hardware isn't available or the 737 // user passed --nvptx-no-fp16-math. The flag is useful because, 738 // although sm_53+ GPUs have some sort of FP16 support in 739 // hardware, only sm_53 and sm_60 have full implementation. Others 740 // only have token amount of hardware and are likely to run faster 741 // by using fp32 units instead. 742 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { 743 setFP16OperationAction(Op, MVT::f16, Legal, Promote); 744 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); 745 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 746 // bf16 must be promoted to f32. 747 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 748 if (getOperationAction(Op, MVT::bf16) == Promote) 749 AddPromotedToType(Op, MVT::bf16, MVT::f32); 750 } 751 752 // f16/f16x2 neg was introduced in PTX 60, SM_53. 753 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 && 754 STI.getPTXVersion() >= 60 && 755 STI.allowFP16Math(); 756 for (const auto &VT : {MVT::f16, MVT::v2f16}) 757 setOperationAction(ISD::FNEG, VT, 758 IsFP16FP16x2NegAvailable ? Legal : Expand); 759 760 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand); 761 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand); 762 // (would be) Library functions. 763 764 // These map to conversion instructions for scalar FP types. 
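  // Roughly, FCEIL/FFLOOR/FTRUNC on f32 map onto the cvt.rpi/cvt.rmi/cvt.rzi
  // rounding variants of the PTX cvt instruction.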
765 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, 766 ISD::FROUNDEVEN, ISD::FTRUNC}) { 767 setOperationAction(Op, MVT::f16, Legal); 768 setOperationAction(Op, MVT::f32, Legal); 769 setOperationAction(Op, MVT::f64, Legal); 770 setOperationAction(Op, MVT::v2f16, Expand); 771 setOperationAction(Op, MVT::v2bf16, Expand); 772 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 773 if (getOperationAction(Op, MVT::bf16) == Promote) 774 AddPromotedToType(Op, MVT::bf16, MVT::f32); 775 } 776 777 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) { 778 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand); 779 } 780 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { 781 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) { 782 setOperationAction(ISD::FP_EXTEND, VT, Custom); 783 setOperationAction(ISD::FP_ROUND, VT, Custom); 784 } 785 } 786 787 // sm_80 only has conversions between f32 and bf16. Custom lower all other 788 // bf16 conversions. 789 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { 790 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) { 791 setOperationAction( 792 {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, 793 VT, Custom); 794 } 795 setOperationAction( 796 {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, 797 MVT::bf16, Custom); 798 } 799 800 setOperationAction(ISD::FROUND, MVT::f16, Promote); 801 setOperationAction(ISD::FROUND, MVT::v2f16, Expand); 802 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand); 803 setOperationAction(ISD::FROUND, MVT::f32, Custom); 804 setOperationAction(ISD::FROUND, MVT::f64, Custom); 805 setOperationAction(ISD::FROUND, MVT::bf16, Promote); 806 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32); 807 808 // 'Expand' implements FCOPYSIGN without calling an external library. 809 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 810 setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); 811 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); 812 setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand); 813 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 814 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 815 816 // These map to corresponding instructions for f32/f64. f16 must be 817 // promoted to f32. v2f16 is expanded to f16, which is then promoted 818 // to f32. 819 for (const auto &Op : 820 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) { 821 setOperationAction(Op, MVT::f16, Promote); 822 setOperationAction(Op, MVT::f32, Legal); 823 setOperationAction(Op, MVT::f64, Legal); 824 setOperationAction(Op, MVT::v2f16, Expand); 825 setOperationAction(Op, MVT::v2bf16, Expand); 826 setOperationAction(Op, MVT::bf16, Promote); 827 AddPromotedToType(Op, MVT::bf16, MVT::f32); 828 } 829 for (const auto &Op : {ISD::FABS}) { 830 setOperationAction(Op, MVT::f16, Promote); 831 setOperationAction(Op, MVT::f32, Legal); 832 setOperationAction(Op, MVT::f64, Legal); 833 setOperationAction(Op, MVT::v2f16, Expand); 834 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 835 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 836 if (getOperationAction(Op, MVT::bf16) == Promote) 837 AddPromotedToType(Op, MVT::bf16, MVT::f32); 838 } 839 840 // max.f16, max.f16x2 and max.NaN are supported on sm_80+. 841 auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) { 842 bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; 843 return IsAtLeastSm80 ? 
Legal : NotSm80Action; 844 }; 845 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { 846 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote); 847 setOperationAction(Op, MVT::f32, Legal); 848 setOperationAction(Op, MVT::f64, Legal); 849 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); 850 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 851 setBF16OperationAction(Op, MVT::bf16, Legal, Promote); 852 if (getOperationAction(Op, MVT::bf16) == Promote) 853 AddPromotedToType(Op, MVT::bf16, MVT::f32); 854 } 855 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) { 856 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand); 857 setFP16OperationAction(Op, MVT::bf16, Legal, Expand); 858 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand)); 859 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); 860 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); 861 } 862 863 // Custom lowering for inline asm with 128-bit operands 864 setOperationAction(ISD::CopyToReg, MVT::i128, Custom); 865 setOperationAction(ISD::CopyFromReg, MVT::i128, Custom); 866 867 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. 868 // No FPOW or FREM in PTX. 869 870 // Now deduce the information based on the above mentioned 871 // actions 872 computeRegisterProperties(STI.getRegisterInfo()); 873 874 setMinCmpXchgSizeInBits(32); 875 setMaxAtomicSizeInBitsSupported(64); 876 setMaxDivRemBitWidthSupported(64); 877 } 878 879 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { 880 881 #define MAKE_CASE(V) \ 882 case V: \ 883 return #V; 884 885 switch ((NVPTXISD::NodeType)Opcode) { 886 case NVPTXISD::FIRST_NUMBER: 887 break; 888 889 MAKE_CASE(NVPTXISD::CALL) 890 MAKE_CASE(NVPTXISD::RET_GLUE) 891 MAKE_CASE(NVPTXISD::LOAD_PARAM) 892 MAKE_CASE(NVPTXISD::Wrapper) 893 MAKE_CASE(NVPTXISD::DeclareParam) 894 MAKE_CASE(NVPTXISD::DeclareScalarParam) 895 MAKE_CASE(NVPTXISD::DeclareRet) 896 MAKE_CASE(NVPTXISD::DeclareScalarRet) 897 MAKE_CASE(NVPTXISD::DeclareRetParam) 898 MAKE_CASE(NVPTXISD::PrintCall) 899 MAKE_CASE(NVPTXISD::PrintConvergentCall) 900 MAKE_CASE(NVPTXISD::PrintCallUni) 901 MAKE_CASE(NVPTXISD::PrintConvergentCallUni) 902 MAKE_CASE(NVPTXISD::LoadParam) 903 MAKE_CASE(NVPTXISD::LoadParamV2) 904 MAKE_CASE(NVPTXISD::LoadParamV4) 905 MAKE_CASE(NVPTXISD::StoreParam) 906 MAKE_CASE(NVPTXISD::StoreParamV2) 907 MAKE_CASE(NVPTXISD::StoreParamV4) 908 MAKE_CASE(NVPTXISD::StoreParamS32) 909 MAKE_CASE(NVPTXISD::StoreParamU32) 910 MAKE_CASE(NVPTXISD::CallArgBegin) 911 MAKE_CASE(NVPTXISD::CallArg) 912 MAKE_CASE(NVPTXISD::LastCallArg) 913 MAKE_CASE(NVPTXISD::CallArgEnd) 914 MAKE_CASE(NVPTXISD::CallVoid) 915 MAKE_CASE(NVPTXISD::CallVal) 916 MAKE_CASE(NVPTXISD::CallSymbol) 917 MAKE_CASE(NVPTXISD::Prototype) 918 MAKE_CASE(NVPTXISD::MoveParam) 919 MAKE_CASE(NVPTXISD::StoreRetval) 920 MAKE_CASE(NVPTXISD::StoreRetvalV2) 921 MAKE_CASE(NVPTXISD::StoreRetvalV4) 922 MAKE_CASE(NVPTXISD::PseudoUseParam) 923 MAKE_CASE(NVPTXISD::RETURN) 924 MAKE_CASE(NVPTXISD::CallSeqBegin) 925 MAKE_CASE(NVPTXISD::CallSeqEnd) 926 MAKE_CASE(NVPTXISD::CallPrototype) 927 MAKE_CASE(NVPTXISD::ProxyReg) 928 MAKE_CASE(NVPTXISD::LoadV2) 929 MAKE_CASE(NVPTXISD::LoadV4) 930 MAKE_CASE(NVPTXISD::LDGV2) 931 MAKE_CASE(NVPTXISD::LDGV4) 932 MAKE_CASE(NVPTXISD::LDUV2) 933 MAKE_CASE(NVPTXISD::LDUV4) 934 MAKE_CASE(NVPTXISD::StoreV2) 935 MAKE_CASE(NVPTXISD::StoreV4) 936 MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP) 937 MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP) 938 
MAKE_CASE(NVPTXISD::IMAD) 939 MAKE_CASE(NVPTXISD::BFE) 940 MAKE_CASE(NVPTXISD::BFI) 941 MAKE_CASE(NVPTXISD::PRMT) 942 MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC) 943 MAKE_CASE(NVPTXISD::SETP_F16X2) 944 MAKE_CASE(NVPTXISD::SETP_BF16X2) 945 MAKE_CASE(NVPTXISD::Dummy) 946 MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED) 947 MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED) 948 MAKE_CASE(NVPTXISD::Tex1DFloatS32) 949 MAKE_CASE(NVPTXISD::Tex1DFloatFloat) 950 MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel) 951 MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad) 952 MAKE_CASE(NVPTXISD::Tex1DS32S32) 953 MAKE_CASE(NVPTXISD::Tex1DS32Float) 954 MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel) 955 MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad) 956 MAKE_CASE(NVPTXISD::Tex1DU32S32) 957 MAKE_CASE(NVPTXISD::Tex1DU32Float) 958 MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel) 959 MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad) 960 MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32) 961 MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat) 962 MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel) 963 MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad) 964 MAKE_CASE(NVPTXISD::Tex1DArrayS32S32) 965 MAKE_CASE(NVPTXISD::Tex1DArrayS32Float) 966 MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel) 967 MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad) 968 MAKE_CASE(NVPTXISD::Tex1DArrayU32S32) 969 MAKE_CASE(NVPTXISD::Tex1DArrayU32Float) 970 MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel) 971 MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatGrad) 972 MAKE_CASE(NVPTXISD::Tex2DFloatS32) 973 MAKE_CASE(NVPTXISD::Tex2DFloatFloat) 974 MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel) 975 MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad) 976 MAKE_CASE(NVPTXISD::Tex2DS32S32) 977 MAKE_CASE(NVPTXISD::Tex2DS32Float) 978 MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel) 979 MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad) 980 MAKE_CASE(NVPTXISD::Tex2DU32S32) 981 MAKE_CASE(NVPTXISD::Tex2DU32Float) 982 MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel) 983 MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad) 984 MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32) 985 MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat) 986 MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel) 987 MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad) 988 MAKE_CASE(NVPTXISD::Tex2DArrayS32S32) 989 MAKE_CASE(NVPTXISD::Tex2DArrayS32Float) 990 MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel) 991 MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad) 992 MAKE_CASE(NVPTXISD::Tex2DArrayU32S32) 993 MAKE_CASE(NVPTXISD::Tex2DArrayU32Float) 994 MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel) 995 MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad) 996 MAKE_CASE(NVPTXISD::Tex3DFloatS32) 997 MAKE_CASE(NVPTXISD::Tex3DFloatFloat) 998 MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel) 999 MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad) 1000 MAKE_CASE(NVPTXISD::Tex3DS32S32) 1001 MAKE_CASE(NVPTXISD::Tex3DS32Float) 1002 MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel) 1003 MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad) 1004 MAKE_CASE(NVPTXISD::Tex3DU32S32) 1005 MAKE_CASE(NVPTXISD::Tex3DU32Float) 1006 MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel) 1007 MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad) 1008 MAKE_CASE(NVPTXISD::TexCubeFloatFloat) 1009 MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel) 1010 MAKE_CASE(NVPTXISD::TexCubeS32Float) 1011 MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel) 1012 MAKE_CASE(NVPTXISD::TexCubeU32Float) 1013 MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel) 1014 MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat) 1015 MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel) 1016 MAKE_CASE(NVPTXISD::TexCubeArrayS32Float) 1017 MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel) 1018 MAKE_CASE(NVPTXISD::TexCubeArrayU32Float) 1019 MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel) 1020 
MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat) 1021 MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat) 1022 MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat) 1023 MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat) 1024 MAKE_CASE(NVPTXISD::Tld4R2DS64Float) 1025 MAKE_CASE(NVPTXISD::Tld4G2DS64Float) 1026 MAKE_CASE(NVPTXISD::Tld4B2DS64Float) 1027 MAKE_CASE(NVPTXISD::Tld4A2DS64Float) 1028 MAKE_CASE(NVPTXISD::Tld4R2DU64Float) 1029 MAKE_CASE(NVPTXISD::Tld4G2DU64Float) 1030 MAKE_CASE(NVPTXISD::Tld4B2DU64Float) 1031 MAKE_CASE(NVPTXISD::Tld4A2DU64Float) 1032 1033 MAKE_CASE(NVPTXISD::TexUnified1DFloatS32) 1034 MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat) 1035 MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel) 1036 MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad) 1037 MAKE_CASE(NVPTXISD::TexUnified1DS32S32) 1038 MAKE_CASE(NVPTXISD::TexUnified1DS32Float) 1039 MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel) 1040 MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad) 1041 MAKE_CASE(NVPTXISD::TexUnified1DU32S32) 1042 MAKE_CASE(NVPTXISD::TexUnified1DU32Float) 1043 MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel) 1044 MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad) 1045 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32) 1046 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat) 1047 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel) 1048 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad) 1049 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32) 1050 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float) 1051 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel) 1052 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad) 1053 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32S32) 1054 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float) 1055 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel) 1056 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad) 1057 MAKE_CASE(NVPTXISD::TexUnified2DFloatS32) 1058 MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat) 1059 MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel) 1060 MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad) 1061 MAKE_CASE(NVPTXISD::TexUnified2DS32S32) 1062 MAKE_CASE(NVPTXISD::TexUnified2DS32Float) 1063 MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel) 1064 MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad) 1065 MAKE_CASE(NVPTXISD::TexUnified2DU32S32) 1066 MAKE_CASE(NVPTXISD::TexUnified2DU32Float) 1067 MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel) 1068 MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad) 1069 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32) 1070 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat) 1071 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel) 1072 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad) 1073 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32) 1074 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float) 1075 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel) 1076 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad) 1077 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32) 1078 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float) 1079 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel) 1080 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad) 1081 MAKE_CASE(NVPTXISD::TexUnified3DFloatS32) 1082 MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat) 1083 MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel) 1084 MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad) 1085 MAKE_CASE(NVPTXISD::TexUnified3DS32S32) 1086 MAKE_CASE(NVPTXISD::TexUnified3DS32Float) 1087 MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel) 1088 MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad) 1089 MAKE_CASE(NVPTXISD::TexUnified3DU32S32) 1090 MAKE_CASE(NVPTXISD::TexUnified3DU32Float) 1091 MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel) 1092 
MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad) 1093 MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat) 1094 MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel) 1095 MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float) 1096 MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel) 1097 MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float) 1098 MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel) 1099 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat) 1100 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel) 1101 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float) 1102 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel) 1103 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float) 1104 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel) 1105 MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad) 1106 MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad) 1107 MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad) 1108 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad) 1109 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad) 1110 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad) 1111 MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat) 1112 MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat) 1113 MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat) 1114 MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat) 1115 MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float) 1116 MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float) 1117 MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float) 1118 MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float) 1119 MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float) 1120 MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float) 1121 MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float) 1122 MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float) 1123 1124 MAKE_CASE(NVPTXISD::Suld1DI8Clamp) 1125 MAKE_CASE(NVPTXISD::Suld1DI16Clamp) 1126 MAKE_CASE(NVPTXISD::Suld1DI32Clamp) 1127 MAKE_CASE(NVPTXISD::Suld1DI64Clamp) 1128 MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp) 1129 MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp) 1130 MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp) 1131 MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp) 1132 MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp) 1133 MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp) 1134 MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp) 1135 1136 MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp) 1137 MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp) 1138 MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp) 1139 MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp) 1140 MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp) 1141 MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp) 1142 MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp) 1143 MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp) 1144 MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp) 1145 MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp) 1146 MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp) 1147 1148 MAKE_CASE(NVPTXISD::Suld2DI8Clamp) 1149 MAKE_CASE(NVPTXISD::Suld2DI16Clamp) 1150 MAKE_CASE(NVPTXISD::Suld2DI32Clamp) 1151 MAKE_CASE(NVPTXISD::Suld2DI64Clamp) 1152 MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp) 1153 MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp) 1154 MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp) 1155 MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp) 1156 MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp) 1157 MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp) 1158 MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp) 1159 1160 MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp) 1161 MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp) 1162 MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp) 1163 MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp) 1164 MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp) 1165 MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp) 1166 MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp) 1167 MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp) 1168 MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp) 1169 
MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp) 1170 MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp) 1171 1172 MAKE_CASE(NVPTXISD::Suld3DI8Clamp) 1173 MAKE_CASE(NVPTXISD::Suld3DI16Clamp) 1174 MAKE_CASE(NVPTXISD::Suld3DI32Clamp) 1175 MAKE_CASE(NVPTXISD::Suld3DI64Clamp) 1176 MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp) 1177 MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp) 1178 MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp) 1179 MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp) 1180 MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp) 1181 MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp) 1182 MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp) 1183 1184 MAKE_CASE(NVPTXISD::Suld1DI8Trap) 1185 MAKE_CASE(NVPTXISD::Suld1DI16Trap) 1186 MAKE_CASE(NVPTXISD::Suld1DI32Trap) 1187 MAKE_CASE(NVPTXISD::Suld1DI64Trap) 1188 MAKE_CASE(NVPTXISD::Suld1DV2I8Trap) 1189 MAKE_CASE(NVPTXISD::Suld1DV2I16Trap) 1190 MAKE_CASE(NVPTXISD::Suld1DV2I32Trap) 1191 MAKE_CASE(NVPTXISD::Suld1DV2I64Trap) 1192 MAKE_CASE(NVPTXISD::Suld1DV4I8Trap) 1193 MAKE_CASE(NVPTXISD::Suld1DV4I16Trap) 1194 MAKE_CASE(NVPTXISD::Suld1DV4I32Trap) 1195 1196 MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap) 1197 MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap) 1198 MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap) 1199 MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap) 1200 MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap) 1201 MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap) 1202 MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap) 1203 MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap) 1204 MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap) 1205 MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap) 1206 MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap) 1207 1208 MAKE_CASE(NVPTXISD::Suld2DI8Trap) 1209 MAKE_CASE(NVPTXISD::Suld2DI16Trap) 1210 MAKE_CASE(NVPTXISD::Suld2DI32Trap) 1211 MAKE_CASE(NVPTXISD::Suld2DI64Trap) 1212 MAKE_CASE(NVPTXISD::Suld2DV2I8Trap) 1213 MAKE_CASE(NVPTXISD::Suld2DV2I16Trap) 1214 MAKE_CASE(NVPTXISD::Suld2DV2I32Trap) 1215 MAKE_CASE(NVPTXISD::Suld2DV2I64Trap) 1216 MAKE_CASE(NVPTXISD::Suld2DV4I8Trap) 1217 MAKE_CASE(NVPTXISD::Suld2DV4I16Trap) 1218 MAKE_CASE(NVPTXISD::Suld2DV4I32Trap) 1219 1220 MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap) 1221 MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap) 1222 MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap) 1223 MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap) 1224 MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap) 1225 MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap) 1226 MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap) 1227 MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap) 1228 MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap) 1229 MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap) 1230 MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap) 1231 1232 MAKE_CASE(NVPTXISD::Suld3DI8Trap) 1233 MAKE_CASE(NVPTXISD::Suld3DI16Trap) 1234 MAKE_CASE(NVPTXISD::Suld3DI32Trap) 1235 MAKE_CASE(NVPTXISD::Suld3DI64Trap) 1236 MAKE_CASE(NVPTXISD::Suld3DV2I8Trap) 1237 MAKE_CASE(NVPTXISD::Suld3DV2I16Trap) 1238 MAKE_CASE(NVPTXISD::Suld3DV2I32Trap) 1239 MAKE_CASE(NVPTXISD::Suld3DV2I64Trap) 1240 MAKE_CASE(NVPTXISD::Suld3DV4I8Trap) 1241 MAKE_CASE(NVPTXISD::Suld3DV4I16Trap) 1242 MAKE_CASE(NVPTXISD::Suld3DV4I32Trap) 1243 1244 MAKE_CASE(NVPTXISD::Suld1DI8Zero) 1245 MAKE_CASE(NVPTXISD::Suld1DI16Zero) 1246 MAKE_CASE(NVPTXISD::Suld1DI32Zero) 1247 MAKE_CASE(NVPTXISD::Suld1DI64Zero) 1248 MAKE_CASE(NVPTXISD::Suld1DV2I8Zero) 1249 MAKE_CASE(NVPTXISD::Suld1DV2I16Zero) 1250 MAKE_CASE(NVPTXISD::Suld1DV2I32Zero) 1251 MAKE_CASE(NVPTXISD::Suld1DV2I64Zero) 1252 MAKE_CASE(NVPTXISD::Suld1DV4I8Zero) 1253 MAKE_CASE(NVPTXISD::Suld1DV4I16Zero) 1254 MAKE_CASE(NVPTXISD::Suld1DV4I32Zero) 1255 1256 MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero) 1257 MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero) 1258 MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero) 1259 
MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero) 1260 MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero) 1261 MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero) 1262 MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero) 1263 MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero) 1264 MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero) 1265 MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero) 1266 MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero) 1267 1268 MAKE_CASE(NVPTXISD::Suld2DI8Zero) 1269 MAKE_CASE(NVPTXISD::Suld2DI16Zero) 1270 MAKE_CASE(NVPTXISD::Suld2DI32Zero) 1271 MAKE_CASE(NVPTXISD::Suld2DI64Zero) 1272 MAKE_CASE(NVPTXISD::Suld2DV2I8Zero) 1273 MAKE_CASE(NVPTXISD::Suld2DV2I16Zero) 1274 MAKE_CASE(NVPTXISD::Suld2DV2I32Zero) 1275 MAKE_CASE(NVPTXISD::Suld2DV2I64Zero) 1276 MAKE_CASE(NVPTXISD::Suld2DV4I8Zero) 1277 MAKE_CASE(NVPTXISD::Suld2DV4I16Zero) 1278 MAKE_CASE(NVPTXISD::Suld2DV4I32Zero) 1279 1280 MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero) 1281 MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero) 1282 MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero) 1283 MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero) 1284 MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero) 1285 MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero) 1286 MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero) 1287 MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero) 1288 MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero) 1289 MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero) 1290 MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero) 1291 1292 MAKE_CASE(NVPTXISD::Suld3DI8Zero) 1293 MAKE_CASE(NVPTXISD::Suld3DI16Zero) 1294 MAKE_CASE(NVPTXISD::Suld3DI32Zero) 1295 MAKE_CASE(NVPTXISD::Suld3DI64Zero) 1296 MAKE_CASE(NVPTXISD::Suld3DV2I8Zero) 1297 MAKE_CASE(NVPTXISD::Suld3DV2I16Zero) 1298 MAKE_CASE(NVPTXISD::Suld3DV2I32Zero) 1299 MAKE_CASE(NVPTXISD::Suld3DV2I64Zero) 1300 MAKE_CASE(NVPTXISD::Suld3DV4I8Zero) 1301 MAKE_CASE(NVPTXISD::Suld3DV4I16Zero) 1302 MAKE_CASE(NVPTXISD::Suld3DV4I32Zero) 1303 } 1304 return nullptr; 1305 1306 #undef MAKE_CASE 1307 } 1308 1309 TargetLoweringBase::LegalizeTypeAction 1310 NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { 1311 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && 1312 VT.getScalarType() == MVT::i1) 1313 return TypeSplitVector; 1314 if (Isv2x16VT(VT)) 1315 return TypeLegal; 1316 return TargetLoweringBase::getPreferredVectorAction(VT); 1317 } 1318 1319 SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, 1320 int Enabled, int &ExtraSteps, 1321 bool &UseOneConst, 1322 bool Reciprocal) const { 1323 if (!(Enabled == ReciprocalEstimate::Enabled || 1324 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) 1325 return SDValue(); 1326 1327 if (ExtraSteps == ReciprocalEstimate::Unspecified) 1328 ExtraSteps = 0; 1329 1330 SDLoc DL(Operand); 1331 EVT VT = Operand.getValueType(); 1332 bool Ftz = useF32FTZ(DAG.getMachineFunction()); 1333 1334 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { 1335 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 1336 DAG.getConstant(IID, DL, MVT::i32), Operand); 1337 }; 1338 1339 // The sqrt and rsqrt refinement processes assume we always start out with an 1340 // approximation of the rsqrt. Therefore, if we're going to do any refinement 1341 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing 1342 // any refinement, we must return a regular sqrt. 1343 if (Reciprocal || ExtraSteps > 0) { 1344 if (VT == MVT::f32) 1345 return MakeIntrinsicCall(Ftz ? 
Intrinsic::nvvm_rsqrt_approx_ftz_f 1346 : Intrinsic::nvvm_rsqrt_approx_f); 1347 else if (VT == MVT::f64) 1348 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); 1349 else 1350 return SDValue(); 1351 } else { 1352 if (VT == MVT::f32) 1353 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f 1354 : Intrinsic::nvvm_sqrt_approx_f); 1355 else { 1356 // There's no sqrt.approx.f64 instruction, so we emit 1357 // reciprocal(rsqrt(x)). This is faster than 1358 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain 1359 // x * rsqrt(x).) 1360 return DAG.getNode( 1361 ISD::INTRINSIC_WO_CHAIN, DL, VT, 1362 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), 1363 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); 1364 } 1365 } 1366 } 1367 1368 SDValue 1369 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 1370 SDLoc dl(Op); 1371 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op); 1372 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); 1373 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); 1374 return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op); 1375 } 1376 1377 static bool IsTypePassedAsArray(const Type *Ty) { 1378 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) || 1379 Ty->isHalfTy() || Ty->isBFloatTy(); 1380 } 1381 1382 std::string NVPTXTargetLowering::getPrototype( 1383 const DataLayout &DL, Type *retTy, const ArgListTy &Args, 1384 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment, 1385 std::optional<std::pair<unsigned, const APInt &>> VAInfo, 1386 const CallBase &CB, unsigned UniqueCallSite) const { 1387 auto PtrVT = getPointerTy(DL); 1388 1389 bool isABI = (STI.getSmVersion() >= 20); 1390 assert(isABI && "Non-ABI compilation is not supported"); 1391 if (!isABI) 1392 return ""; 1393 1394 std::string Prototype; 1395 raw_string_ostream O(Prototype); 1396 O << "prototype_" << UniqueCallSite << " : .callprototype "; 1397 1398 if (retTy->getTypeID() == Type::VoidTyID) { 1399 O << "()"; 1400 } else { 1401 O << "("; 1402 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) && 1403 !IsTypePassedAsArray(retTy)) { 1404 unsigned size = 0; 1405 if (auto *ITy = dyn_cast<IntegerType>(retTy)) { 1406 size = ITy->getBitWidth(); 1407 } else { 1408 assert(retTy->isFloatingPointTy() && 1409 "Floating point type expected here"); 1410 size = retTy->getPrimitiveSizeInBits(); 1411 } 1412 // PTX ABI requires all scalar return values to be at least 32 1413 // bits in size. fp16 normally uses .b16 as its storage type in 1414 // PTX, so its size must be adjusted here, too. 1415 size = promoteScalarArgumentSize(size); 1416 1417 O << ".param .b" << size << " _"; 1418 } else if (isa<PointerType>(retTy)) { 1419 O << ".param .b" << PtrVT.getSizeInBits() << " _"; 1420 } else if (IsTypePassedAsArray(retTy)) { 1421 O << ".param .align " << (retAlignment ? retAlignment->value() : 0) 1422 << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; 1423 } else { 1424 llvm_unreachable("Unknown return type"); 1425 } 1426 O << ") "; 1427 } 1428 O << "_ ("; 1429 1430 bool first = true; 1431 1432 unsigned NumArgs = VAInfo ? 
VAInfo->first : Args.size(); 1433 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) { 1434 Type *Ty = Args[i].Ty; 1435 if (!first) { 1436 O << ", "; 1437 } 1438 first = false; 1439 1440 if (!Outs[OIdx].Flags.isByVal()) { 1441 if (IsTypePassedAsArray(Ty)) { 1442 Align ParamAlign = 1443 getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL); 1444 O << ".param .align " << ParamAlign.value() << " .b8 "; 1445 O << "_"; 1446 O << "[" << DL.getTypeAllocSize(Ty) << "]"; 1447 // update the index for Outs 1448 SmallVector<EVT, 16> vtparts; 1449 ComputeValueVTs(*this, DL, Ty, vtparts); 1450 if (unsigned len = vtparts.size()) 1451 OIdx += len - 1; 1452 continue; 1453 } 1454 // i8 types in IR will be i16 types in SDAG 1455 assert((getValueType(DL, Ty) == Outs[OIdx].VT || 1456 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && 1457 "type mismatch between callee prototype and arguments"); 1458 // scalar type 1459 unsigned sz = 0; 1460 if (isa<IntegerType>(Ty)) { 1461 sz = cast<IntegerType>(Ty)->getBitWidth(); 1462 sz = promoteScalarArgumentSize(sz); 1463 } else if (isa<PointerType>(Ty)) { 1464 sz = PtrVT.getSizeInBits(); 1465 } else { 1466 sz = Ty->getPrimitiveSizeInBits(); 1467 } 1468 O << ".param .b" << sz << " "; 1469 O << "_"; 1470 continue; 1471 } 1472 1473 // Indirect calls need strict ABI alignment so we disable optimizations by 1474 // not providing a function to optimize. 1475 Type *ETy = Args[i].IndirectType; 1476 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1477 Align ParamByValAlign = 1478 getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL); 1479 1480 O << ".param .align " << ParamByValAlign.value() << " .b8 "; 1481 O << "_"; 1482 O << "[" << Outs[OIdx].Flags.getByValSize() << "]"; 1483 } 1484 1485 if (VAInfo) 1486 O << (first ? "" : ",") << " .param .align " << VAInfo->second 1487 << " .b8 _[]\n"; 1488 O << ")"; 1489 if (shouldEmitPTXNoReturn(&CB, *nvTM)) 1490 O << " .noreturn"; 1491 O << ";"; 1492 1493 return Prototype; 1494 } 1495 1496 Align NVPTXTargetLowering::getFunctionArgumentAlignment( 1497 const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const { 1498 return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL)); 1499 } 1500 1501 Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, 1502 unsigned Idx, 1503 const DataLayout &DL) const { 1504 if (!CB) { 1505 // CallSite is zero, fallback to ABI type alignment 1506 return DL.getABITypeAlign(Ty); 1507 } 1508 1509 const Function *DirectCallee = CB->getCalledFunction(); 1510 1511 if (!DirectCallee) { 1512 // We don't have a direct function symbol, but that may be because of 1513 // constant cast instructions in the call. 
1514 1515 // With bitcast'd call targets, the instruction will be the call 1516 if (const auto *CI = dyn_cast<CallInst>(CB)) { 1517 // Check if we have call alignment metadata 1518 if (MaybeAlign StackAlign = getAlign(*CI, Idx)) 1519 return StackAlign.value(); 1520 } 1521 DirectCallee = getMaybeBitcastedCallee(CB); 1522 } 1523 1524 // Check for function alignment information if we found that the 1525 // ultimate target is a Function 1526 if (DirectCallee) 1527 return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL); 1528 1529 // Call is indirect, fall back to the ABI type alignment 1530 return DL.getABITypeAlign(Ty); 1531 } 1532 1533 static bool adjustElementType(EVT &ElementType) { 1534 switch (ElementType.getSimpleVT().SimpleTy) { 1535 default: 1536 return false; 1537 case MVT::f16: 1538 case MVT::bf16: 1539 ElementType = MVT::i16; 1540 return true; 1541 case MVT::f32: 1542 case MVT::v2f16: 1543 case MVT::v2bf16: 1544 ElementType = MVT::i32; 1545 return true; 1546 case MVT::f64: 1547 ElementType = MVT::i64; 1548 return true; 1549 } 1550 } 1551 1552 // Use byte-store when the param address of the argument value is unaligned. 1553 // This may happen when the return value is a field of a packed structure. 1554 // 1555 // This is called in LowerCall() when passing the param values. 1556 static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, 1557 uint64_t Offset, EVT ElementType, 1558 SDValue StVal, SDValue &InGlue, 1559 unsigned ArgID, const SDLoc &dl) { 1560 // Bit logic only works on integer types 1561 if (adjustElementType(ElementType)) 1562 StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); 1563 1564 // Store each byte 1565 SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1566 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { 1567 // Shift the byte to the last byte position 1568 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, 1569 DAG.getConstant(i * 8, dl, MVT::i32)); 1570 SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), 1571 DAG.getConstant(Offset + i, dl, MVT::i32), 1572 ShiftVal, InGlue}; 1573 // Trunc store only the last byte by using 1574 // st.param.b8 1575 // The register type can be larger than b8. 1576 Chain = DAG.getMemIntrinsicNode( 1577 NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, 1578 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 1579 InGlue = Chain.getValue(1); 1580 } 1581 return Chain; 1582 } 1583 1584 // Use byte-load when the param adress of the returned value is unaligned. 1585 // This may happen when the returned value is a field of a packed structure. 1586 static SDValue 1587 LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, 1588 EVT ElementType, SDValue &InGlue, 1589 SmallVectorImpl<SDValue> &TempProxyRegOps, 1590 const SDLoc &dl) { 1591 // Bit logic only works on integer types 1592 EVT MergedType = ElementType; 1593 adjustElementType(MergedType); 1594 1595 // Load each byte and construct the whole value. 
Initial value to 0 1596 SDValue RetVal = DAG.getConstant(0, dl, MergedType); 1597 // LoadParamMemI8 loads into i16 register only 1598 SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); 1599 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { 1600 SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), 1601 DAG.getConstant(Offset + i, dl, MVT::i32), 1602 InGlue}; 1603 // This will be selected to LoadParamMemI8 1604 SDValue LdVal = 1605 DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, 1606 MVT::i8, MachinePointerInfo(), Align(1)); 1607 SDValue TmpLdVal = LdVal.getValue(0); 1608 Chain = LdVal.getValue(1); 1609 InGlue = LdVal.getValue(2); 1610 1611 TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, 1612 TmpLdVal.getSimpleValueType(), TmpLdVal); 1613 TempProxyRegOps.push_back(TmpLdVal); 1614 1615 SDValue CMask = DAG.getConstant(255, dl, MergedType); 1616 SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); 1617 // Need to extend the i16 register to the whole width. 1618 TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); 1619 // Mask off the high bits. Leave only the lower 8bits. 1620 // Do this because we are using loadparam.b8. 1621 TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); 1622 // Shift and merge 1623 TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); 1624 RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); 1625 } 1626 if (ElementType != MergedType) 1627 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); 1628 1629 return RetVal; 1630 } 1631 1632 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 1633 SmallVectorImpl<SDValue> &InVals) const { 1634 1635 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30)) 1636 report_fatal_error( 1637 "Support for variadic functions (unsized array parameter) introduced " 1638 "in PTX ISA version 6.0 and requires target sm_30."); 1639 1640 SelectionDAG &DAG = CLI.DAG; 1641 SDLoc dl = CLI.DL; 1642 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 1643 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 1644 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 1645 SDValue Chain = CLI.Chain; 1646 SDValue Callee = CLI.Callee; 1647 bool &isTailCall = CLI.IsTailCall; 1648 ArgListTy &Args = CLI.getArgs(); 1649 Type *RetTy = CLI.RetTy; 1650 const CallBase *CB = CLI.CB; 1651 const DataLayout &DL = DAG.getDataLayout(); 1652 1653 bool isABI = (STI.getSmVersion() >= 20); 1654 assert(isABI && "Non-ABI compilation is not supported"); 1655 if (!isABI) 1656 return Chain; 1657 1658 // Variadic arguments. 1659 // 1660 // Normally, for each argument, we declare a param scalar or a param 1661 // byte array in the .param space, and store the argument value to that 1662 // param scalar or array starting at offset 0. 1663 // 1664 // In the case of the first variadic argument, we declare a vararg byte array 1665 // with size 0. The exact size of this array isn't known at this point, so 1666 // it'll be patched later. All the variadic arguments will be stored to this 1667 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is 1668 // initially set to 0, so it can be used for non-variadic arguments (which use 1669 // 0 offset) to simplify the code. 1670 // 1671 // After all vararg is processed, 'VAOffset' holds the size of the 1672 // vararg byte array. 
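  // Illustrative sketch only (an assumed shape, not verbatim output of this
  // lowering): calling a hypothetical variadic device function
  // 'bar(int, ...)' as bar(1, 2, 3) produces parameter declarations along
  // these lines:
  //   .param .b32 param0;                    // fixed argument
  //   st.param.b32 [param0+0], %r1;
  //   .param .align <align> .b8 param1[8];   // vararg byte array; the final
  //                                          // size (VAOffset == 8) is
  //                                          // patched in further below
  //   st.param.b32 [param1+0], %r2;          // variadic '2' at offset 0
  //   st.param.b32 [param1+4], %r3;          // variadic '3' at offset 4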
1673 1674 SDValue VADeclareParam; // vararg byte array 1675 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic 1676 unsigned VAOffset = 0; // current offset in the param array 1677 1678 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(1); 1679 SDValue TempChain = Chain; 1680 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); 1681 SDValue InGlue = Chain.getValue(1); 1682 1683 unsigned ParamCount = 0; 1684 // Args.size() and Outs.size() need not match. 1685 // Outs.size() will be larger 1686 // * if there is an aggregate argument with multiple fields (each field 1687 // showing up separately in Outs) 1688 // * if there is a vector argument with more than typical vector-length 1689 // elements (generally if more than 4) where each vector element is 1690 // individually present in Outs. 1691 // So a different index should be used for indexing into Outs/OutVals. 1692 // See similar issue in LowerFormalArguments. 1693 unsigned OIdx = 0; 1694 // Declare the .params or .reg need to pass values 1695 // to the function 1696 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { 1697 EVT VT = Outs[OIdx].VT; 1698 Type *Ty = Args[i].Ty; 1699 bool IsVAArg = (i >= CLI.NumFixedArgs); 1700 bool IsByVal = Outs[OIdx].Flags.isByVal(); 1701 1702 SmallVector<EVT, 16> VTs; 1703 SmallVector<uint64_t, 16> Offsets; 1704 1705 assert((!IsByVal || Args[i].IndirectType) && 1706 "byval arg must have indirect type"); 1707 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty); 1708 ComputePTXValueVTs(*this, DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset); 1709 1710 Align ArgAlign; 1711 if (IsByVal) { 1712 // The ByValAlign in the Outs[OIdx].Flags is always set at this point, 1713 // so we don't need to worry whether it's naturally aligned or not. 1714 // See TargetLowering::LowerCallTo(). 1715 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); 1716 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy, 1717 InitialAlign, DL); 1718 if (IsVAArg) 1719 VAOffset = alignTo(VAOffset, ArgAlign); 1720 } else { 1721 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL); 1722 } 1723 1724 unsigned TypeSize = 1725 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); 1726 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1727 1728 bool NeedAlign; // Does argument declaration specify alignment? 1729 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty); 1730 if (IsVAArg) { 1731 if (ParamCount == FirstVAArg) { 1732 SDValue DeclareParamOps[] = { 1733 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32), 1734 DAG.getConstant(ParamCount, dl, MVT::i32), 1735 DAG.getConstant(1, dl, MVT::i32), InGlue}; 1736 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, 1737 DeclareParamVTs, DeclareParamOps); 1738 } 1739 NeedAlign = PassAsArray; 1740 } else if (PassAsArray) { 1741 // declare .param .align <align> .b8 .param<n>[<size>]; 1742 SDValue DeclareParamOps[] = { 1743 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), 1744 DAG.getConstant(ParamCount, dl, MVT::i32), 1745 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue}; 1746 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, 1747 DeclareParamOps); 1748 NeedAlign = true; 1749 } else { 1750 // declare .param .b<size> .param<n>; 1751 if (VT.isInteger() || VT.isFloatingPoint()) { 1752 // PTX ABI requires integral types to be at least 32 bits in 1753 // size. FP16 is loaded/stored using i16, so it's handled 1754 // here as well. 
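  // For example (illustrative): a 'short' argument has TypeSize == 2 at this
  // point and is declared as a 32-bit .param after promotion; an f16
  // argument, which is stored as .b16, is promoted the same way.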
1755 TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8; 1756 } 1757 SDValue DeclareScalarParamOps[] = { 1758 Chain, DAG.getConstant(ParamCount, dl, MVT::i32), 1759 DAG.getConstant(TypeSize * 8, dl, MVT::i32), 1760 DAG.getConstant(0, dl, MVT::i32), InGlue}; 1761 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, 1762 DeclareScalarParamOps); 1763 NeedAlign = false; 1764 } 1765 InGlue = Chain.getValue(1); 1766 1767 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1768 // than 32-bits are sign extended or zero extended, depending on 1769 // whether they are signed or unsigned types. This case applies 1770 // only to scalar parameters and not to aggregate values. 1771 bool ExtendIntegerParam = 1772 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; 1773 1774 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg); 1775 SmallVector<SDValue, 6> StoreOperands; 1776 for (unsigned j = 0, je = VTs.size(); j != je; ++j) { 1777 EVT EltVT = VTs[j]; 1778 int CurOffset = Offsets[j]; 1779 MaybeAlign PartAlign; 1780 if (NeedAlign) 1781 PartAlign = commonAlignment(ArgAlign, CurOffset); 1782 1783 SDValue StVal = OutVals[OIdx]; 1784 1785 MVT PromotedVT; 1786 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 1787 EltVT = EVT(PromotedVT); 1788 } 1789 if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) { 1790 llvm::ISD::NodeType Ext = 1791 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1792 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal); 1793 } 1794 1795 if (IsByVal) { 1796 auto PtrVT = getPointerTy(DL); 1797 SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal, 1798 DAG.getConstant(CurOffset, dl, PtrVT)); 1799 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr, MachinePointerInfo(), 1800 PartAlign); 1801 } else if (ExtendIntegerParam) { 1802 assert(VTs.size() == 1 && "Scalar can't have multiple parts."); 1803 // zext/sext to i32 1804 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 1805 : ISD::ZERO_EXTEND, 1806 dl, MVT::i32, StVal); 1807 } 1808 1809 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) { 1810 // Use 16-bit registers for small stores as it's the 1811 // smallest general purpose register size supported by NVPTX. 1812 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); 1813 } 1814 1815 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a 1816 // scalar store. In such cases, fall back to byte stores. 1817 if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() && 1818 PartAlign.value() < 1819 DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) { 1820 assert(StoreOperands.empty() && "Unfinished preceeding store."); 1821 Chain = LowerUnalignedStoreParam( 1822 DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT, 1823 StVal, InGlue, ParamCount, dl); 1824 1825 // LowerUnalignedStoreParam took care of inserting the necessary nodes 1826 // into the SDAG, so just move on to the next element. 1827 if (!IsByVal) 1828 ++OIdx; 1829 continue; 1830 } 1831 1832 // New store. 1833 if (VectorInfo[j] & PVF_FIRST) { 1834 assert(StoreOperands.empty() && "Unfinished preceding store."); 1835 StoreOperands.push_back(Chain); 1836 StoreOperands.push_back( 1837 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); 1838 1839 StoreOperands.push_back(DAG.getConstant( 1840 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), 1841 dl, MVT::i32)); 1842 } 1843 1844 // Record the value to store. 
1845 StoreOperands.push_back(StVal); 1846 1847 if (VectorInfo[j] & PVF_LAST) { 1848 unsigned NumElts = StoreOperands.size() - 3; 1849 NVPTXISD::NodeType Op; 1850 switch (NumElts) { 1851 case 1: 1852 Op = NVPTXISD::StoreParam; 1853 break; 1854 case 2: 1855 Op = NVPTXISD::StoreParamV2; 1856 break; 1857 case 4: 1858 Op = NVPTXISD::StoreParamV4; 1859 break; 1860 default: 1861 llvm_unreachable("Invalid vector info."); 1862 } 1863 1864 StoreOperands.push_back(InGlue); 1865 1866 // Adjust type of the store op if we've extended the scalar 1867 // return value. 1868 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; 1869 1870 Chain = DAG.getMemIntrinsicNode( 1871 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, 1872 TheStoreType, MachinePointerInfo(), PartAlign, 1873 MachineMemOperand::MOStore); 1874 InGlue = Chain.getValue(1); 1875 1876 // Cleanup. 1877 StoreOperands.clear(); 1878 1879 // TODO: We may need to support vector types that can be passed 1880 // as scalars in variadic arguments. 1881 if (!IsByVal && IsVAArg) { 1882 assert(NumElts == 1 && 1883 "Vectorization is expected to be disabled for variadics."); 1884 VAOffset += DL.getTypeAllocSize( 1885 TheStoreType.getTypeForEVT(*DAG.getContext())); 1886 } 1887 } 1888 if (!IsByVal) 1889 ++OIdx; 1890 } 1891 assert(StoreOperands.empty() && "Unfinished parameter store."); 1892 if (!IsByVal && VTs.size() > 0) 1893 --OIdx; 1894 ++ParamCount; 1895 if (IsByVal && IsVAArg) 1896 VAOffset += TypeSize; 1897 } 1898 1899 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); 1900 MaybeAlign retAlignment = std::nullopt; 1901 1902 // Handle Result 1903 if (Ins.size() > 0) { 1904 SmallVector<EVT, 16> resvtparts; 1905 ComputeValueVTs(*this, DL, RetTy, resvtparts); 1906 1907 // Declare 1908 // .param .align N .b8 retval0[<size-in-bytes>], or 1909 // .param .b<size-in-bits> retval0 1910 unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy); 1911 if (!IsTypePassedAsArray(RetTy)) { 1912 resultsz = promoteScalarArgumentSize(resultsz); 1913 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1914 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 1915 DAG.getConstant(resultsz, dl, MVT::i32), 1916 DAG.getConstant(0, dl, MVT::i32), InGlue }; 1917 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, 1918 DeclareRetOps); 1919 InGlue = Chain.getValue(1); 1920 } else { 1921 retAlignment = getArgumentAlignment(CB, RetTy, 0, DL); 1922 assert(retAlignment && "retAlignment is guaranteed to be set"); 1923 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1924 SDValue DeclareRetOps[] = { 1925 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), 1926 DAG.getConstant(resultsz / 8, dl, MVT::i32), 1927 DAG.getConstant(0, dl, MVT::i32), InGlue}; 1928 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, 1929 DeclareRetOps); 1930 InGlue = Chain.getValue(1); 1931 } 1932 } 1933 1934 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); 1935 // Set the size of the vararg param byte array if the callee is a variadic 1936 // function and the variadic part is not empty. 
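  // Illustrative example: if the variadic part consisted of two i32 values,
  // VAOffset is now 8, and the placeholder-sized array declared for
  // VADeclareParam above is rewritten below to declare an 8-byte array.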
1937 if (HasVAArgs) { 1938 SDValue DeclareParamOps[] = { 1939 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), 1940 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32), 1941 VADeclareParam.getOperand(4)}; 1942 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(), 1943 VADeclareParam->getVTList(), DeclareParamOps); 1944 } 1945 1946 // Both indirect calls and libcalls have nullptr Func. In order to distinguish 1947 // between them we must rely on the call site value which is valid for 1948 // indirect calls but is always null for libcalls. 1949 bool isIndirectCall = !Func && CB; 1950 1951 if (isa<ExternalSymbolSDNode>(Callee)) { 1952 Function* CalleeFunc = nullptr; 1953 1954 // Try to find the callee in the current module. 1955 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc); 1956 assert(CalleeFunc != nullptr && "Libcall callee must be set."); 1957 1958 // Set the "libcall callee" attribute to indicate that the function 1959 // must always have a declaration. 1960 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true"); 1961 } 1962 1963 if (isIndirectCall) { 1964 // This is indirect function call case : PTX requires a prototype of the 1965 // form 1966 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); 1967 // to be emitted, and the label has to used as the last arg of call 1968 // instruction. 1969 // The prototype is embedded in a string and put as the operand for a 1970 // CallPrototype SDNode which will print out to the value of the string. 1971 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1972 std::string Proto = getPrototype( 1973 DL, RetTy, Args, Outs, retAlignment, 1974 HasVAArgs 1975 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair( 1976 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1))) 1977 : std::nullopt, 1978 *CB, UniqueCallSite); 1979 const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); 1980 SDValue ProtoOps[] = { 1981 Chain, 1982 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), 1983 InGlue, 1984 }; 1985 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); 1986 InGlue = Chain.getValue(1); 1987 } 1988 // Op to just print "call" 1989 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); 1990 SDValue PrintCallOps[] = { 1991 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue 1992 }; 1993 // We model convergent calls as separate opcodes. 1994 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; 1995 if (CLI.IsConvergent) 1996 Opcode = Opcode == NVPTXISD::PrintCallUni ? 
NVPTXISD::PrintConvergentCallUni 1997 : NVPTXISD::PrintConvergentCall; 1998 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); 1999 InGlue = Chain.getValue(1); 2000 2001 // Ops to print out the function name 2002 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2003 SDValue CallVoidOps[] = { Chain, Callee, InGlue }; 2004 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps); 2005 InGlue = Chain.getValue(1); 2006 2007 // Ops to print out the param list 2008 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2009 SDValue CallArgBeginOps[] = { Chain, InGlue }; 2010 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs, 2011 CallArgBeginOps); 2012 InGlue = Chain.getValue(1); 2013 2014 for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e; 2015 ++i) { 2016 unsigned opcode; 2017 if (i == (e - 1)) 2018 opcode = NVPTXISD::LastCallArg; 2019 else 2020 opcode = NVPTXISD::CallArg; 2021 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2022 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), 2023 DAG.getConstant(i, dl, MVT::i32), InGlue }; 2024 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); 2025 InGlue = Chain.getValue(1); 2026 } 2027 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2028 SDValue CallArgEndOps[] = { Chain, 2029 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32), 2030 InGlue }; 2031 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); 2032 InGlue = Chain.getValue(1); 2033 2034 if (isIndirectCall) { 2035 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); 2036 SDValue PrototypeOps[] = { 2037 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue}; 2038 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); 2039 InGlue = Chain.getValue(1); 2040 } 2041 2042 SmallVector<SDValue, 16> ProxyRegOps; 2043 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates; 2044 // An item of the vector is filled if the element does not need a ProxyReg 2045 // operation on it and should be added to InVals as is. ProxyRegOps and 2046 // ProxyRegTruncates contain empty/none items at the same index. 2047 SmallVector<SDValue, 16> RetElts; 2048 // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` 2049 // to use the values of `LoadParam`s and to be replaced later then 2050 // `CALLSEQ_END` is added. 2051 SmallVector<SDValue, 16> TempProxyRegOps; 2052 2053 // Generate loads from param memory/moves from registers for result 2054 if (Ins.size() > 0) { 2055 SmallVector<EVT, 16> VTs; 2056 SmallVector<uint64_t, 16> Offsets; 2057 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0); 2058 assert(VTs.size() == Ins.size() && "Bad value decomposition"); 2059 2060 Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); 2061 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); 2062 2063 SmallVector<EVT, 6> LoadVTs; 2064 int VecIdx = -1; // Index of the first element of the vector. 2065 2066 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 2067 // 32-bits are sign extended or zero extended, depending on whether 2068 // they are signed or unsigned types. 
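  // For example (illustrative): an i8 or i16 return value is declared and
  // loaded as a 32-bit .param below, and the extra bits are truncated away
  // when the loaded value is moved into InVals.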
2069 bool ExtendIntegerRetVal = 2070 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 2071 2072 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 2073 bool needTruncate = false; 2074 EVT TheLoadType = VTs[i]; 2075 EVT EltType = Ins[i].VT; 2076 Align EltAlign = commonAlignment(RetAlign, Offsets[i]); 2077 MVT PromotedVT; 2078 2079 if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) { 2080 TheLoadType = EVT(PromotedVT); 2081 EltType = EVT(PromotedVT); 2082 needTruncate = true; 2083 } 2084 2085 if (ExtendIntegerRetVal) { 2086 TheLoadType = MVT::i32; 2087 EltType = MVT::i32; 2088 needTruncate = true; 2089 } else if (TheLoadType.getSizeInBits() < 16) { 2090 if (VTs[i].isInteger()) 2091 needTruncate = true; 2092 EltType = MVT::i16; 2093 } 2094 2095 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a 2096 // scalar load. In such cases, fall back to byte loads. 2097 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() && 2098 EltAlign < DL.getABITypeAlign( 2099 TheLoadType.getTypeForEVT(*DAG.getContext()))) { 2100 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 2101 SDValue Ret = LowerUnalignedLoadRetParam( 2102 DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl); 2103 ProxyRegOps.push_back(SDValue()); 2104 ProxyRegTruncates.push_back(std::optional<MVT>()); 2105 RetElts.resize(i); 2106 RetElts.push_back(Ret); 2107 2108 continue; 2109 } 2110 2111 // Record index of the very first element of the vector. 2112 if (VectorInfo[i] & PVF_FIRST) { 2113 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); 2114 VecIdx = i; 2115 } 2116 2117 LoadVTs.push_back(EltType); 2118 2119 if (VectorInfo[i] & PVF_LAST) { 2120 unsigned NumElts = LoadVTs.size(); 2121 LoadVTs.push_back(MVT::Other); 2122 LoadVTs.push_back(MVT::Glue); 2123 NVPTXISD::NodeType Op; 2124 switch (NumElts) { 2125 case 1: 2126 Op = NVPTXISD::LoadParam; 2127 break; 2128 case 2: 2129 Op = NVPTXISD::LoadParamV2; 2130 break; 2131 case 4: 2132 Op = NVPTXISD::LoadParamV4; 2133 break; 2134 default: 2135 llvm_unreachable("Invalid vector info."); 2136 } 2137 2138 SDValue LoadOperands[] = { 2139 Chain, DAG.getConstant(1, dl, MVT::i32), 2140 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue}; 2141 SDValue RetVal = DAG.getMemIntrinsicNode( 2142 Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, 2143 MachinePointerInfo(), EltAlign, 2144 MachineMemOperand::MOLoad); 2145 2146 for (unsigned j = 0; j < NumElts; ++j) { 2147 ProxyRegOps.push_back(RetVal.getValue(j)); 2148 2149 if (needTruncate) 2150 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT)); 2151 else 2152 ProxyRegTruncates.push_back(std::optional<MVT>()); 2153 } 2154 2155 Chain = RetVal.getValue(NumElts); 2156 InGlue = RetVal.getValue(NumElts + 1); 2157 2158 // Cleanup 2159 VecIdx = -1; 2160 LoadVTs.clear(); 2161 } 2162 } 2163 } 2164 2165 Chain = 2166 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); 2167 InGlue = Chain.getValue(1); 2168 2169 // Append ProxyReg instructions to the chain to make sure that `callseq_end` 2170 // will not get lost. Otherwise, during libcalls expansion, the nodes can become 2171 // dangling. 
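  // Illustrative shape of what is built below for each returned value:
  //   LoadParam -> ProxyReg(Chain, value, Glue) -> (optional truncate) -> InVals
  // with each ProxyReg glued after CALLSEQ_END so the call sequence stays
  // reachable from the uses of the results.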
2172 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { 2173 if (i < RetElts.size() && RetElts[i]) { 2174 InVals.push_back(RetElts[i]); 2175 continue; 2176 } 2177 2178 SDValue Ret = DAG.getNode( 2179 NVPTXISD::ProxyReg, dl, 2180 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), 2181 { Chain, ProxyRegOps[i], InGlue } 2182 ); 2183 2184 Chain = Ret.getValue(1); 2185 InGlue = Ret.getValue(2); 2186 2187 if (ProxyRegTruncates[i]) { 2188 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret); 2189 } 2190 2191 InVals.push_back(Ret); 2192 } 2193 2194 for (SDValue &T : TempProxyRegOps) { 2195 SDValue Repl = DAG.getNode( 2196 NVPTXISD::ProxyReg, dl, 2197 DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue), 2198 {Chain, T.getOperand(0), InGlue}); 2199 DAG.ReplaceAllUsesWith(T, Repl); 2200 DAG.RemoveDeadNode(T.getNode()); 2201 2202 Chain = Repl.getValue(1); 2203 InGlue = Repl.getValue(2); 2204 } 2205 2206 // set isTailCall to false for now, until we figure out how to express 2207 // tail call optimization in PTX 2208 isTailCall = false; 2209 return Chain; 2210 } 2211 2212 SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 2213 SelectionDAG &DAG) const { 2214 2215 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) { 2216 const Function &Fn = DAG.getMachineFunction().getFunction(); 2217 2218 DiagnosticInfoUnsupported NoDynamicAlloca( 2219 Fn, 2220 "Support for dynamic alloca introduced in PTX ISA version 7.3 and " 2221 "requires target sm_52.", 2222 SDLoc(Op).getDebugLoc()); 2223 DAG.getContext()->diagnose(NoDynamicAlloca); 2224 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), 2225 Op.getOperand(0)}; 2226 return DAG.getMergeValues(Ops, SDLoc()); 2227 } 2228 2229 SDValue Chain = Op.getOperand(0); 2230 SDValue Size = Op.getOperand(1); 2231 uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 2232 SDLoc DL(Op.getNode()); 2233 2234 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32. 2235 if (nvTM->is64Bit()) 2236 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64); 2237 else 2238 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32); 2239 2240 SDValue AllocOps[] = {Chain, Size, 2241 DAG.getTargetConstant(Align, DL, MVT::i32)}; 2242 SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, 2243 nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps); 2244 2245 SDValue MergeOps[] = {Alloca, Chain}; 2246 return DAG.getMergeValues(MergeOps, DL); 2247 } 2248 2249 // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 2250 // (see LegalizeDAG.cpp). This is slow and uses local memory. 2251 // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 2252 SDValue 2253 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 2254 SDNode *Node = Op.getNode(); 2255 SDLoc dl(Node); 2256 SmallVector<SDValue, 8> Ops; 2257 unsigned NumOperands = Node->getNumOperands(); 2258 for (unsigned i = 0; i < NumOperands; ++i) { 2259 SDValue SubOp = Node->getOperand(i); 2260 EVT VVT = SubOp.getNode()->getValueType(0); 2261 EVT EltVT = VVT.getVectorElementType(); 2262 unsigned NumSubElem = VVT.getVectorNumElements(); 2263 for (unsigned j = 0; j < NumSubElem; ++j) { 2264 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp, 2265 DAG.getIntPtrConstant(j, dl))); 2266 } 2267 } 2268 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); 2269 } 2270 2271 // We can init constant f16x2/v2i16/v4i8 with a single .b32 move. 
Normally it 2272 // would get lowered as two constant loads and vector-packing move. 2273 // Instead we want just a constant move: 2274 // mov.b32 %r2, 0x40003C00 2275 SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, 2276 SelectionDAG &DAG) const { 2277 EVT VT = Op->getValueType(0); 2278 if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) 2279 return Op; 2280 2281 SDLoc DL(Op); 2282 2283 if (!llvm::all_of(Op->ops(), [](SDValue Operand) { 2284 return Operand->isUndef() || isa<ConstantSDNode>(Operand) || 2285 isa<ConstantFPSDNode>(Operand); 2286 })) { 2287 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us 2288 // to optimize calculation of constant parts. 2289 if (VT == MVT::v4i8) { 2290 SDValue C8 = DAG.getConstant(8, DL, MVT::i32); 2291 SDValue E01 = DAG.getNode( 2292 NVPTXISD::BFI, DL, MVT::i32, 2293 DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), 2294 DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); 2295 SDValue E012 = 2296 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2297 DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), 2298 E01, DAG.getConstant(16, DL, MVT::i32), C8); 2299 SDValue E0123 = 2300 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2301 DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), 2302 E012, DAG.getConstant(24, DL, MVT::i32), C8); 2303 return DAG.getNode(ISD::BITCAST, DL, VT, E0123); 2304 } 2305 return Op; 2306 } 2307 2308 // Get value or the Nth operand as an APInt(32). Undef values treated as 0. 2309 auto GetOperand = [](SDValue Op, int N) -> APInt { 2310 const SDValue &Operand = Op->getOperand(N); 2311 EVT VT = Op->getValueType(0); 2312 if (Operand->isUndef()) 2313 return APInt(32, 0); 2314 APInt Value; 2315 if (VT == MVT::v2f16 || VT == MVT::v2bf16) 2316 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt(); 2317 else if (VT == MVT::v2i16 || VT == MVT::v4i8) 2318 Value = Operand->getAsAPIntVal(); 2319 else 2320 llvm_unreachable("Unsupported type"); 2321 // i8 values are carried around as i16, so we need to zero out upper bits, 2322 // so they do not get in the way of combining individual byte values 2323 if (VT == MVT::v4i8) 2324 Value = Value.trunc(8); 2325 return Value.zext(32); 2326 }; 2327 APInt Value; 2328 if (Isv2x16VT(VT)) { 2329 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16); 2330 } else if (VT == MVT::v4i8) { 2331 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) | 2332 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24); 2333 } else { 2334 llvm_unreachable("Unsupported type"); 2335 } 2336 SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32); 2337 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const); 2338 } 2339 2340 SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 2341 SelectionDAG &DAG) const { 2342 SDValue Index = Op->getOperand(1); 2343 SDValue Vector = Op->getOperand(0); 2344 SDLoc DL(Op); 2345 EVT VectorVT = Vector.getValueType(); 2346 2347 if (VectorVT == MVT::v4i8) { 2348 SDValue BFE = 2349 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32, 2350 {Vector, 2351 DAG.getNode(ISD::MUL, DL, MVT::i32, 2352 DAG.getZExtOrTrunc(Index, DL, MVT::i32), 2353 DAG.getConstant(8, DL, MVT::i32)), 2354 DAG.getConstant(8, DL, MVT::i32)}); 2355 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0)); 2356 } 2357 2358 // Constant index will be matched by tablegen. 2359 if (isa<ConstantSDNode>(Index.getNode())) 2360 return Op; 2361 2362 // Extract individual elements and select one of them. 
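  // For example (illustrative): extracting a dynamic index %i from
  // <2 x half> %v becomes
  //   e0 = extractelt %v, 0;  e1 = extractelt %v, 1;
  //   result = (%i == 0) ? e0 : e1
  // which is what the select built below implements.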
2363 assert(Isv2x16VT(VectorVT) && "Unexpected vector type."); 2364 EVT EltVT = VectorVT.getVectorElementType(); 2365 2366 SDLoc dl(Op.getNode()); 2367 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2368 DAG.getIntPtrConstant(0, dl)); 2369 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector, 2370 DAG.getIntPtrConstant(1, dl)); 2371 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1, 2372 ISD::CondCode::SETEQ); 2373 } 2374 2375 SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 2376 SelectionDAG &DAG) const { 2377 SDValue Vector = Op->getOperand(0); 2378 EVT VectorVT = Vector.getValueType(); 2379 2380 if (VectorVT != MVT::v4i8) 2381 return Op; 2382 SDLoc DL(Op); 2383 SDValue Value = Op->getOperand(1); 2384 if (Value->isUndef()) 2385 return Vector; 2386 2387 SDValue Index = Op->getOperand(2); 2388 2389 SDValue BFI = 2390 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, 2391 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector, 2392 DAG.getNode(ISD::MUL, DL, MVT::i32, 2393 DAG.getZExtOrTrunc(Index, DL, MVT::i32), 2394 DAG.getConstant(8, DL, MVT::i32)), 2395 DAG.getConstant(8, DL, MVT::i32)}); 2396 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI); 2397 } 2398 2399 SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 2400 SelectionDAG &DAG) const { 2401 SDValue V1 = Op.getOperand(0); 2402 EVT VectorVT = V1.getValueType(); 2403 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8) 2404 return Op; 2405 2406 // Lower shuffle to PRMT instruction. 2407 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 2408 SDValue V2 = Op.getOperand(1); 2409 uint32_t Selector = 0; 2410 for (auto I : llvm::enumerate(SVN->getMask())) { 2411 if (I.value() != -1) // -1 is a placeholder for undef. 2412 Selector |= (I.value() << (I.index() * 4)); 2413 } 2414 2415 SDLoc DL(Op); 2416 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, 2417 DAG.getConstant(Selector, DL, MVT::i32), 2418 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); 2419 } 2420 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 2421 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2422 /// amount, or 2423 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2424 /// amount. 2425 SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, 2426 SelectionDAG &DAG) const { 2427 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2428 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 2429 2430 EVT VT = Op.getValueType(); 2431 unsigned VTBits = VT.getSizeInBits(); 2432 SDLoc dl(Op); 2433 SDValue ShOpLo = Op.getOperand(0); 2434 SDValue ShOpHi = Op.getOperand(1); 2435 SDValue ShAmt = Op.getOperand(2); 2436 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; 2437 2438 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2439 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2440 // {dHi, dLo} = {aHi, aLo} >> Amt 2441 // dHi = aHi >> Amt 2442 // dLo = shf.r.clamp aLo, aHi, Amt 2443 2444 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2445 SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi, 2446 ShAmt); 2447 2448 SDValue Ops[2] = { Lo, Hi }; 2449 return DAG.getMergeValues(Ops, dl); 2450 } 2451 else { 2452 // {dHi, dLo} = {aHi, aLo} >> Amt 2453 // - if (Amt>=size) then 2454 // dLo = aHi >> (Amt-size) 2455 // dHi = aHi >> Amt (this is either all 0 or all 1) 2456 // else 2457 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 2458 // dHi = aHi >> Amt 2459 2460 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2461 DAG.getConstant(VTBits, dl, MVT::i32), 2462 ShAmt); 2463 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 2464 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2465 DAG.getConstant(VTBits, dl, MVT::i32)); 2466 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 2467 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2468 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 2469 2470 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2471 DAG.getConstant(VTBits, dl, MVT::i32), 2472 ISD::SETGE); 2473 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 2474 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2475 2476 SDValue Ops[2] = { Lo, Hi }; 2477 return DAG.getMergeValues(Ops, dl); 2478 } 2479 } 2480 2481 /// LowerShiftLeftParts - Lower SHL_PARTS, which 2482 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2483 /// amount, or 2484 /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2485 /// amount. 2486 SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, 2487 SelectionDAG &DAG) const { 2488 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 2489 assert(Op.getOpcode() == ISD::SHL_PARTS); 2490 2491 EVT VT = Op.getValueType(); 2492 unsigned VTBits = VT.getSizeInBits(); 2493 SDLoc dl(Op); 2494 SDValue ShOpLo = Op.getOperand(0); 2495 SDValue ShOpHi = Op.getOperand(1); 2496 SDValue ShAmt = Op.getOperand(2); 2497 2498 if (VTBits == 32 && STI.getSmVersion() >= 35) { 2499 // For 32bit and sm35, we can use the funnel shift 'shf' instruction. 
2500 // {dHi, dLo} = {aHi, aLo} << Amt 2501 // dHi = shf.l.clamp aLo, aHi, Amt 2502 // dLo = aLo << Amt 2503 2504 SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi, 2505 ShAmt); 2506 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2507 2508 SDValue Ops[2] = { Lo, Hi }; 2509 return DAG.getMergeValues(Ops, dl); 2510 } 2511 else { 2512 // {dHi, dLo} = {aHi, aLo} << Amt 2513 // - if (Amt>=size) then 2514 // dLo = aLo << Amt (all 0) 2515 // dLo = aLo << (Amt-size) 2516 // else 2517 // dLo = aLo << Amt 2518 // dHi = (aHi << Amt) | (aLo >> (size-Amt)) 2519 2520 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 2521 DAG.getConstant(VTBits, dl, MVT::i32), 2522 ShAmt); 2523 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 2524 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 2525 DAG.getConstant(VTBits, dl, MVT::i32)); 2526 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 2527 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 2528 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 2529 2530 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, 2531 DAG.getConstant(VTBits, dl, MVT::i32), 2532 ISD::SETGE); 2533 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 2534 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal); 2535 2536 SDValue Ops[2] = { Lo, Hi }; 2537 return DAG.getMergeValues(Ops, dl); 2538 } 2539 } 2540 2541 SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { 2542 EVT VT = Op.getValueType(); 2543 2544 if (VT == MVT::f32) 2545 return LowerFROUND32(Op, DAG); 2546 2547 if (VT == MVT::f64) 2548 return LowerFROUND64(Op, DAG); 2549 2550 llvm_unreachable("unhandled type"); 2551 } 2552 2553 // This is the the rounding method used in CUDA libdevice in C like code: 2554 // float roundf(float A) 2555 // { 2556 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)); 2557 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2558 // return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2559 // } 2560 SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op, 2561 SelectionDAG &DAG) const { 2562 SDLoc SL(Op); 2563 SDValue A = Op.getOperand(0); 2564 EVT VT = Op.getValueType(); 2565 2566 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2567 2568 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)) 2569 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A); 2570 const int SignBitMask = 0x80000000; 2571 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast, 2572 DAG.getConstant(SignBitMask, SL, MVT::i32)); 2573 const int PointFiveInBits = 0x3F000000; 2574 SDValue PointFiveWithSignRaw = 2575 DAG.getNode(ISD::OR, SL, MVT::i32, Sign, 2576 DAG.getConstant(PointFiveInBits, SL, MVT::i32)); 2577 SDValue PointFiveWithSign = 2578 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw); 2579 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign); 2580 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2581 2582 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2583 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2584 SDValue IsLarge = 2585 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT), 2586 ISD::SETOGT); 2587 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2588 2589 // return abs(A) < 0.5 ? 
(float)(int)A : RoundedA; 2590 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2591 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2592 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A); 2593 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA); 2594 } 2595 2596 // The implementation of round(double) is similar to that of round(float) in 2597 // that they both separate the value range into three regions and use a method 2598 // specific to the region to round the values. However, round(double) first 2599 // calculates the round of the absolute value and then adds the sign back while 2600 // round(float) directly rounds the value with sign. 2601 SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, 2602 SelectionDAG &DAG) const { 2603 SDLoc SL(Op); 2604 SDValue A = Op.getOperand(0); 2605 EVT VT = Op.getValueType(); 2606 2607 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A); 2608 2609 // double RoundedA = (double) (int) (abs(A) + 0.5f); 2610 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA, 2611 DAG.getConstantFP(0.5, SL, VT)); 2612 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA); 2613 2614 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2615 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); 2616 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA, 2617 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT); 2618 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall, 2619 DAG.getConstantFP(0, SL, VT), 2620 RoundedA); 2621 2622 // Add sign to rounded_A 2623 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A); 2624 DAG.getNode(ISD::FTRUNC, SL, VT, A); 2625 2626 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2627 SDValue IsLarge = 2628 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT), 2629 ISD::SETOGT); 2630 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA); 2631 } 2632 2633 SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op, 2634 SelectionDAG &DAG) const { 2635 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); 2636 2637 if (Op.getValueType() == MVT::bf16) { 2638 SDLoc Loc(Op); 2639 return DAG.getNode( 2640 ISD::FP_ROUND, Loc, MVT::bf16, 2641 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)), 2642 DAG.getIntPtrConstant(0, Loc)); 2643 } 2644 2645 // Everything else is considered legal. 2646 return Op; 2647 } 2648 2649 SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op, 2650 SelectionDAG &DAG) const { 2651 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); 2652 2653 if (Op.getOperand(0).getValueType() == MVT::bf16) { 2654 SDLoc Loc(Op); 2655 return DAG.getNode( 2656 Op.getOpcode(), Loc, Op.getValueType(), 2657 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0))); 2658 } 2659 2660 // Everything else is considered legal. 2661 return Op; 2662 } 2663 2664 SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op, 2665 SelectionDAG &DAG) const { 2666 EVT NarrowVT = Op.getValueType(); 2667 SDValue Wide = Op.getOperand(0); 2668 EVT WideVT = Wide.getValueType(); 2669 if (NarrowVT.getScalarType() == MVT::bf16) { 2670 const TargetLowering *TLI = STI.getTargetLowering(); 2671 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) { 2672 return TLI->expandFP_ROUND(Op.getNode(), DAG); 2673 } 2674 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { 2675 // This combination was the first to support f32 -> bf16. 
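  // Illustrative summary (instruction spelling assumed): with sm_80/PTX 7.0+,
  // an f32 source can be rounded to bf16 directly by the hardware (a
  // cvt.rn.bf16.f32-style conversion), so the f32 case is left as-is below.
  // An f64 source is first rounded to f32 with round-to-odd so that the
  // final f32 -> bf16 rounding does not double-round.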
2676 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) { 2677 if (WideVT.getScalarType() == MVT::f32) { 2678 return Op; 2679 } 2680 if (WideVT.getScalarType() == MVT::f64) { 2681 SDLoc Loc(Op); 2682 // Round-inexact-to-odd f64 to f32, then do the final rounding using 2683 // the hardware f32 -> bf16 instruction. 2684 SDValue rod = TLI->expandRoundInexactToOdd( 2685 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32) 2686 : MVT::f32, 2687 Wide, Loc, DAG); 2688 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT); 2689 } 2690 } 2691 return TLI->expandFP_ROUND(Op.getNode(), DAG); 2692 } 2693 } 2694 2695 // Everything else is considered legal. 2696 return Op; 2697 } 2698 2699 SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op, 2700 SelectionDAG &DAG) const { 2701 SDValue Narrow = Op.getOperand(0); 2702 EVT NarrowVT = Narrow.getValueType(); 2703 EVT WideVT = Op.getValueType(); 2704 if (NarrowVT.getScalarType() == MVT::bf16) { 2705 if (WideVT.getScalarType() == MVT::f32 && 2706 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) { 2707 SDLoc Loc(Op); 2708 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow); 2709 } 2710 if (WideVT.getScalarType() == MVT::f64 && 2711 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) { 2712 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32) 2713 : MVT::f32; 2714 SDLoc Loc(Op); 2715 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) { 2716 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow); 2717 } else { 2718 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow); 2719 } 2720 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op); 2721 } 2722 } 2723 2724 // Everything else is considered legal. 2725 return Op; 2726 } 2727 2728 static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) { 2729 SDLoc DL(Op); 2730 if (Op.getValueType() != MVT::v2i16) 2731 return Op; 2732 EVT EltVT = Op.getValueType().getVectorElementType(); 2733 SmallVector<SDValue> VecElements; 2734 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) { 2735 SmallVector<SDValue> ScalarArgs; 2736 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs), 2737 [&](const SDUse &O) { 2738 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, 2739 O.get(), DAG.getIntPtrConstant(I, DL)); 2740 }); 2741 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs)); 2742 } 2743 SDValue V = 2744 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements); 2745 return V; 2746 } 2747 2748 SDValue 2749 NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 2750 switch (Op.getOpcode()) { 2751 case ISD::RETURNADDR: 2752 return SDValue(); 2753 case ISD::FRAMEADDR: 2754 return SDValue(); 2755 case ISD::GlobalAddress: 2756 return LowerGlobalAddress(Op, DAG); 2757 case ISD::INTRINSIC_W_CHAIN: 2758 return Op; 2759 case ISD::BUILD_VECTOR: 2760 return LowerBUILD_VECTOR(Op, DAG); 2761 case ISD::EXTRACT_SUBVECTOR: 2762 return Op; 2763 case ISD::EXTRACT_VECTOR_ELT: 2764 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 2765 case ISD::INSERT_VECTOR_ELT: 2766 return LowerINSERT_VECTOR_ELT(Op, DAG); 2767 case ISD::VECTOR_SHUFFLE: 2768 return LowerVECTOR_SHUFFLE(Op, DAG); 2769 case ISD::CONCAT_VECTORS: 2770 return LowerCONCAT_VECTORS(Op, DAG); 2771 case ISD::STORE: 2772 return LowerSTORE(Op, DAG); 2773 case ISD::LOAD: 2774 return LowerLOAD(Op, DAG); 2775 case ISD::SHL_PARTS: 2776 return LowerShiftLeftParts(Op, DAG); 2777 case ISD::SRA_PARTS: 2778 case ISD::SRL_PARTS: 2779 return LowerShiftRightParts(Op, DAG); 2780 case 
ISD::SELECT: 2781 return LowerSelect(Op, DAG); 2782 case ISD::FROUND: 2783 return LowerFROUND(Op, DAG); 2784 case ISD::SINT_TO_FP: 2785 case ISD::UINT_TO_FP: 2786 return LowerINT_TO_FP(Op, DAG); 2787 case ISD::FP_TO_SINT: 2788 case ISD::FP_TO_UINT: 2789 return LowerFP_TO_INT(Op, DAG); 2790 case ISD::FP_ROUND: 2791 return LowerFP_ROUND(Op, DAG); 2792 case ISD::FP_EXTEND: 2793 return LowerFP_EXTEND(Op, DAG); 2794 case ISD::VAARG: 2795 return LowerVAARG(Op, DAG); 2796 case ISD::VASTART: 2797 return LowerVASTART(Op, DAG); 2798 case ISD::ABS: 2799 case ISD::SMIN: 2800 case ISD::SMAX: 2801 case ISD::UMIN: 2802 case ISD::UMAX: 2803 case ISD::ADD: 2804 case ISD::SUB: 2805 case ISD::MUL: 2806 case ISD::SHL: 2807 case ISD::SREM: 2808 case ISD::UREM: 2809 return LowerVectorArith(Op, DAG); 2810 case ISD::DYNAMIC_STACKALLOC: 2811 return LowerDYNAMIC_STACKALLOC(Op, DAG); 2812 case ISD::CopyToReg: 2813 return LowerCopyToReg_128(Op, DAG); 2814 default: 2815 llvm_unreachable("Custom lowering not defined for operation"); 2816 } 2817 } 2818 2819 // This function is almost a copy of SelectionDAG::expandVAArg(). 2820 // The only diff is that this one produces loads from local address space. 2821 SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 2822 const TargetLowering *TLI = STI.getTargetLowering(); 2823 SDLoc DL(Op); 2824 2825 SDNode *Node = Op.getNode(); 2826 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); 2827 EVT VT = Node->getValueType(0); 2828 auto *Ty = VT.getTypeForEVT(*DAG.getContext()); 2829 SDValue Tmp1 = Node->getOperand(0); 2830 SDValue Tmp2 = Node->getOperand(1); 2831 const MaybeAlign MA(Node->getConstantOperandVal(3)); 2832 2833 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL, 2834 Tmp1, Tmp2, MachinePointerInfo(V)); 2835 SDValue VAList = VAListLoad; 2836 2837 if (MA && *MA > TLI->getMinStackArgumentAlignment()) { 2838 VAList = DAG.getNode( 2839 ISD::ADD, DL, VAList.getValueType(), VAList, 2840 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType())); 2841 2842 VAList = DAG.getNode( 2843 ISD::AND, DL, VAList.getValueType(), VAList, 2844 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType())); 2845 } 2846 2847 // Increment the pointer, VAList, to the next vaarg 2848 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList, 2849 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty), 2850 DL, VAList.getValueType())); 2851 2852 // Store the incremented VAList to the legalized pointer 2853 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2, 2854 MachinePointerInfo(V)); 2855 2856 const Value *SrcV = 2857 Constant::getNullValue(PointerType::get(Ty, ADDRESS_SPACE_LOCAL)); 2858 2859 // Load the actual argument out of the pointer VAList 2860 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV)); 2861 } 2862 2863 SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 2864 const TargetLowering *TLI = STI.getTargetLowering(); 2865 SDLoc DL(Op); 2866 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout()); 2867 2868 // Store the address of unsized array <function>_vararg[] in the ap object. 
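  // For example (illustrative): for a callee 'foo', va_start stores the
  // address of the 'foo_vararg' parameter array into the va_list, and
  // LowerVAARG above then loads successive arguments from that address at
  // increasing, suitably aligned offsets.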
2869 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT); 2870 SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper, DL, PtrVT, Arg); 2871 2872 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 2873 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1), 2874 MachinePointerInfo(SV)); 2875 } 2876 2877 SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { 2878 SDValue Op0 = Op->getOperand(0); 2879 SDValue Op1 = Op->getOperand(1); 2880 SDValue Op2 = Op->getOperand(2); 2881 SDLoc DL(Op.getNode()); 2882 2883 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1"); 2884 2885 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); 2886 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); 2887 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); 2888 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); 2889 2890 return Trunc; 2891 } 2892 2893 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 2894 if (Op.getValueType() == MVT::i1) 2895 return LowerLOADi1(Op, DAG); 2896 2897 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle 2898 // unaligned loads and have to handle it here. 2899 EVT VT = Op.getValueType(); 2900 if (Isv2x16VT(VT) || VT == MVT::v4i8) { 2901 LoadSDNode *Load = cast<LoadSDNode>(Op); 2902 EVT MemVT = Load->getMemoryVT(); 2903 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2904 MemVT, *Load->getMemOperand())) { 2905 SDValue Ops[2]; 2906 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); 2907 return DAG.getMergeValues(Ops, SDLoc(Op)); 2908 } 2909 } 2910 2911 return SDValue(); 2912 } 2913 2914 // v = ld i1* addr 2915 // => 2916 // v1 = ld i8* addr (-> i16) 2917 // v = trunc i16 to i1 2918 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { 2919 SDNode *Node = Op.getNode(); 2920 LoadSDNode *LD = cast<LoadSDNode>(Node); 2921 SDLoc dl(Node); 2922 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); 2923 assert(Node->getValueType(0) == MVT::i1 && 2924 "Custom lowering for i1 load only"); 2925 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(), 2926 LD->getBasePtr(), LD->getPointerInfo(), 2927 MVT::i8, LD->getAlign(), 2928 LD->getMemOperand()->getFlags()); 2929 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); 2930 // The legalizer (the caller) is expecting two values from the legalized 2931 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2932 // in LegalizeDAG.cpp which also uses MergeValues. 2933 SDValue Ops[] = { result, LD->getChain() }; 2934 return DAG.getMergeValues(Ops, dl); 2935 } 2936 2937 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 2938 StoreSDNode *Store = cast<StoreSDNode>(Op); 2939 EVT VT = Store->getMemoryVT(); 2940 2941 if (VT == MVT::i1) 2942 return LowerSTOREi1(Op, DAG); 2943 2944 // v2f16 is legal, so we can't rely on legalizer to handle unaligned 2945 // stores and have to handle it here. 2946 if ((Isv2x16VT(VT) || VT == MVT::v4i8) && 2947 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), 2948 VT, *Store->getMemOperand())) 2949 return expandUnalignedStore(Store, DAG); 2950 2951 // v2f16, v2bf16 and v2i16 don't need special handling. 
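  // Illustrative note: a sufficiently aligned v2f16/v2bf16/v2i16/v4i8 store
  // is left to instruction selection, which can emit the packed value as a
  // single 32-bit access.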
2952 if (Isv2x16VT(VT) || VT == MVT::v4i8) 2953 return SDValue(); 2954 2955 if (VT.isVector()) 2956 return LowerSTOREVector(Op, DAG); 2957 2958 return SDValue(); 2959 } 2960 2961 SDValue 2962 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { 2963 SDNode *N = Op.getNode(); 2964 SDValue Val = N->getOperand(1); 2965 SDLoc DL(N); 2966 EVT ValVT = Val.getValueType(); 2967 2968 if (ValVT.isVector()) { 2969 // We only handle "native" vector sizes for now, e.g. <4 x double> is not 2970 // legal. We can (and should) split that into 2 stores of <2 x double> here 2971 // but I'm leaving that as a TODO for now. 2972 if (!ValVT.isSimple()) 2973 return SDValue(); 2974 switch (ValVT.getSimpleVT().SimpleTy) { 2975 default: 2976 return SDValue(); 2977 case MVT::v2i8: 2978 case MVT::v2i16: 2979 case MVT::v2i32: 2980 case MVT::v2i64: 2981 case MVT::v2f16: 2982 case MVT::v2bf16: 2983 case MVT::v2f32: 2984 case MVT::v2f64: 2985 case MVT::v4i8: 2986 case MVT::v4i16: 2987 case MVT::v4i32: 2988 case MVT::v4f16: 2989 case MVT::v4bf16: 2990 case MVT::v4f32: 2991 case MVT::v8f16: // <4 x f16x2> 2992 case MVT::v8bf16: // <4 x bf16x2> 2993 case MVT::v8i16: // <4 x i16x2> 2994 // This is a "native" vector type 2995 break; 2996 } 2997 2998 MemSDNode *MemSD = cast<MemSDNode>(N); 2999 const DataLayout &TD = DAG.getDataLayout(); 3000 3001 Align Alignment = MemSD->getAlign(); 3002 Align PrefAlign = 3003 TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext())); 3004 if (Alignment < PrefAlign) { 3005 // This store is not sufficiently aligned, so bail out and let this vector 3006 // store be scalarized. Note that we may still be able to emit smaller 3007 // vector stores. For example, if we are storing a <4 x float> with an 3008 // alignment of 8, this check will fail but the legalizer will try again 3009 // with 2 x <2 x float>, which will succeed with an alignment of 8. 3010 return SDValue(); 3011 } 3012 3013 unsigned Opcode = 0; 3014 EVT EltVT = ValVT.getVectorElementType(); 3015 unsigned NumElts = ValVT.getVectorNumElements(); 3016 3017 // Since StoreV2 is a target node, we cannot rely on DAG type legalization. 3018 // Therefore, we must ensure the type is legal. For i1 and i8, we set the 3019 // stored type to i16 and propagate the "real" type as the memory type. 3020 bool NeedExt = false; 3021 if (EltVT.getSizeInBits() < 16) 3022 NeedExt = true; 3023 3024 bool StoreF16x2 = false; 3025 switch (NumElts) { 3026 default: 3027 return SDValue(); 3028 case 2: 3029 Opcode = NVPTXISD::StoreV2; 3030 break; 3031 case 4: 3032 Opcode = NVPTXISD::StoreV4; 3033 break; 3034 case 8: 3035 // v8f16 is a special case. PTX doesn't have st.v8.f16 3036 // instruction. Instead, we split the vector into v2f16 chunks and 3037 // store them with st.v4.b32. 
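      // Illustrative example: an <8 x half> value <a,b,c,d,e,f,g,h> is
      // repackaged below into four v2f16 values <a,b>, <c,d>, <e,f>, <g,h>,
      // which become the four 32-bit operands of a single StoreV4 node
      // (emitted as st.v4.b32). The same applies to v8bf16 and v8i16.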
3038 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector."); 3039 Opcode = NVPTXISD::StoreV4; 3040 StoreF16x2 = true; 3041 break; 3042 } 3043 3044 SmallVector<SDValue, 8> Ops; 3045 3046 // First is the chain 3047 Ops.push_back(N->getOperand(0)); 3048 3049 if (StoreF16x2) { 3050 // Combine f16,f16 -> v2f16 3051 NumElts /= 2; 3052 for (unsigned i = 0; i < NumElts; ++i) { 3053 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 3054 DAG.getIntPtrConstant(i * 2, DL)); 3055 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 3056 DAG.getIntPtrConstant(i * 2 + 1, DL)); 3057 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2); 3058 SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1); 3059 Ops.push_back(V2); 3060 } 3061 } else { 3062 // Then the split values 3063 for (unsigned i = 0; i < NumElts; ++i) { 3064 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, 3065 DAG.getIntPtrConstant(i, DL)); 3066 if (NeedExt) 3067 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); 3068 Ops.push_back(ExtVal); 3069 } 3070 } 3071 3072 // Then any remaining arguments 3073 Ops.append(N->op_begin() + 2, N->op_end()); 3074 3075 SDValue NewSt = 3076 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, 3077 MemSD->getMemoryVT(), MemSD->getMemOperand()); 3078 3079 // return DCI.CombineTo(N, NewSt, true); 3080 return NewSt; 3081 } 3082 3083 return SDValue(); 3084 } 3085 3086 // st i1 v, addr 3087 // => 3088 // v1 = zxt v to i16 3089 // st.u8 i16, addr 3090 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { 3091 SDNode *Node = Op.getNode(); 3092 SDLoc dl(Node); 3093 StoreSDNode *ST = cast<StoreSDNode>(Node); 3094 SDValue Tmp1 = ST->getChain(); 3095 SDValue Tmp2 = ST->getBasePtr(); 3096 SDValue Tmp3 = ST->getValue(); 3097 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only"); 3098 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); 3099 SDValue Result = 3100 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, 3101 ST->getAlign(), ST->getMemOperand()->getFlags()); 3102 return Result; 3103 } 3104 3105 SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op, 3106 SelectionDAG &DAG) const { 3107 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit 3108 // operand so that it can pass the legalization. 
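  // Roughly: CopyToReg(Chain, Reg, i128 Val [, Glue]) is rewritten as
  // CopyToReg(Chain, Reg, Lo, Hi [, Glue]), where Lo and Hi are the two i64
  // halves extracted from a v2i64 bitcast of Val.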
3109 3110 assert(Op.getOperand(1).getValueType() == MVT::i128 && 3111 "Custom lowering for 128-bit CopyToReg only"); 3112 3113 SDNode *Node = Op.getNode(); 3114 SDLoc DL(Node); 3115 3116 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2)); 3117 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, 3118 DAG.getIntPtrConstant(0, DL)); 3119 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast, 3120 DAG.getIntPtrConstant(1, DL)); 3121 3122 SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1); 3123 SmallVector<EVT, 3> ResultsType(Node->values()); 3124 3125 NewOps[0] = Op->getOperand(0); // Chain 3126 NewOps[1] = Op->getOperand(1); // Dst Reg 3127 NewOps[2] = Lo; // Lower 64-bit 3128 NewOps[3] = Hi; // Higher 64-bit 3129 if (Op.getNumOperands() == 4) 3130 NewOps[4] = Op->getOperand(3); // Glue if exists 3131 3132 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps); 3133 } 3134 3135 unsigned NVPTXTargetLowering::getNumRegisters( 3136 LLVMContext &Context, EVT VT, 3137 std::optional<MVT> RegisterVT = std::nullopt) const { 3138 if (VT == MVT::i128 && RegisterVT == MVT::i128) 3139 return 1; 3140 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT); 3141 } 3142 3143 bool NVPTXTargetLowering::splitValueIntoRegisterParts( 3144 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, 3145 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const { 3146 if (Val.getValueType() == MVT::i128 && NumParts == 1) { 3147 Parts[0] = Val; 3148 return true; 3149 } 3150 return false; 3151 } 3152 3153 // This creates target external symbol for a function parameter. 3154 // Name of the symbol is composed from its index and the function name. 3155 // Negative index corresponds to special parameter (unsized array) used for 3156 // passing variable arguments. 3157 SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, 3158 EVT v) const { 3159 StringRef SavedStr = nvTM->getStrPool().save( 3160 getParamName(&DAG.getMachineFunction().getFunction(), idx)); 3161 return DAG.getTargetExternalSymbol(SavedStr.data(), v); 3162 } 3163 3164 SDValue NVPTXTargetLowering::LowerFormalArguments( 3165 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 3166 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 3167 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { 3168 MachineFunction &MF = DAG.getMachineFunction(); 3169 const DataLayout &DL = DAG.getDataLayout(); 3170 auto PtrVT = getPointerTy(DAG.getDataLayout()); 3171 3172 const Function *F = &MF.getFunction(); 3173 const AttributeList &PAL = F->getAttributes(); 3174 const TargetLowering *TLI = STI.getTargetLowering(); 3175 3176 SDValue Root = DAG.getRoot(); 3177 std::vector<SDValue> OutChains; 3178 3179 bool isABI = (STI.getSmVersion() >= 20); 3180 assert(isABI && "Non-ABI compilation is not supported"); 3181 if (!isABI) 3182 return Chain; 3183 3184 std::vector<Type *> argTypes; 3185 std::vector<const Argument *> theArgs; 3186 for (const Argument &I : F->args()) { 3187 theArgs.push_back(&I); 3188 argTypes.push_back(I.getType()); 3189 } 3190 // argTypes.size() (or theArgs.size()) and Ins.size() need not match. 3191 // Ins.size() will be larger 3192 // * if there is an aggregate argument with multiple fields (each field 3193 // showing up separately in Ins) 3194 // * if there is a vector argument with more than typical vector-length 3195 // elements (generally if more than 4) where each vector element is 3196 // individually present in Ins. 
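  // For example (illustrative): a struct parameter {i32, float} is one entry
  // in argTypes/theArgs but is flattened into two entries in Ins.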
3197 // So a different index should be used for indexing into Ins. 3198 // See similar issue in LowerCall. 3199 unsigned InsIdx = 0; 3200 3201 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) { 3202 Type *Ty = argTypes[i]; 3203 3204 if (theArgs[i]->use_empty()) { 3205 // argument is dead 3206 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) { 3207 SmallVector<EVT, 16> vtparts; 3208 3209 ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts); 3210 if (vtparts.empty()) 3211 report_fatal_error("Empty parameter types are not supported"); 3212 3213 for (unsigned parti = 0, parte = vtparts.size(); parti != parte; 3214 ++parti) { 3215 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3216 ++InsIdx; 3217 } 3218 if (vtparts.size() > 0) 3219 --InsIdx; 3220 continue; 3221 } 3222 if (Ty->isVectorTy()) { 3223 EVT ObjectVT = getValueType(DL, Ty); 3224 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); 3225 for (unsigned parti = 0; parti < NumRegs; ++parti) { 3226 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3227 ++InsIdx; 3228 } 3229 if (NumRegs > 0) 3230 --InsIdx; 3231 continue; 3232 } 3233 InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); 3234 continue; 3235 } 3236 3237 // In the following cases, assign a node order of "i+1" 3238 // to newly created nodes. The SDNodes for params have to 3239 // appear in the same order as their order of appearance 3240 // in the original function. "i+1" holds that order. 3241 if (!PAL.hasParamAttr(i, Attribute::ByVal)) { 3242 bool aggregateIsPacked = false; 3243 if (StructType *STy = dyn_cast<StructType>(Ty)) 3244 aggregateIsPacked = STy->isPacked(); 3245 3246 SmallVector<EVT, 16> VTs; 3247 SmallVector<uint64_t, 16> Offsets; 3248 ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0); 3249 if (VTs.empty()) 3250 report_fatal_error("Empty parameter types are not supported"); 3251 3252 Align ArgAlign = getFunctionArgumentAlignment( 3253 F, Ty, i + AttributeList::FirstArgIndex, DL); 3254 auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign); 3255 3256 SDValue Arg = getParamSymbol(DAG, i, PtrVT); 3257 int VecIdx = -1; // Index of the first element of the current vector. 3258 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { 3259 if (VectorInfo[parti] & PVF_FIRST) { 3260 assert(VecIdx == -1 && "Orphaned vector."); 3261 VecIdx = parti; 3262 } 3263 3264 // That's the last element of this store op. 3265 if (VectorInfo[parti] & PVF_LAST) { 3266 unsigned NumElts = parti - VecIdx + 1; 3267 EVT EltVT = VTs[parti]; 3268 // i1 is loaded/stored as i8. 3269 EVT LoadVT = EltVT; 3270 if (EltVT == MVT::i1) 3271 LoadVT = MVT::i8; 3272 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) 3273 // getLoad needs a vector type, but it can't handle 3274 // vectors which contain v2f16 or v2bf16 elements. So we must load 3275 // using i32 here and then bitcast back. 
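          // (Illustrative: two v2f16 parts vectorized together are loaded
          // below as a single v2i32, and each i32 lane is bitcast back to
          // v2f16 after it is extracted.)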
3276 LoadVT = MVT::i32; 3277 3278 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); 3279 SDValue VecAddr = 3280 DAG.getNode(ISD::ADD, dl, PtrVT, Arg, 3281 DAG.getConstant(Offsets[VecIdx], dl, PtrVT)); 3282 Value *srcValue = Constant::getNullValue(PointerType::get( 3283 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); 3284 3285 const MaybeAlign PartAlign = [&]() -> MaybeAlign { 3286 if (aggregateIsPacked) 3287 return Align(1); 3288 if (NumElts != 1) 3289 return std::nullopt; 3290 Align PartAlign = 3291 DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext())); 3292 return commonAlignment(PartAlign, Offsets[parti]); 3293 }(); 3294 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr, 3295 MachinePointerInfo(srcValue), PartAlign, 3296 MachineMemOperand::MODereferenceable | 3297 MachineMemOperand::MOInvariant); 3298 if (P.getNode()) 3299 P.getNode()->setIROrder(i + 1); 3300 for (unsigned j = 0; j < NumElts; ++j) { 3301 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P, 3302 DAG.getIntPtrConstant(j, dl)); 3303 // We've loaded i1 as an i8 and now must truncate it back to i1 3304 if (EltVT == MVT::i1) 3305 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); 3306 // v2f16 was loaded as an i32. Now we must bitcast it back. 3307 else if (EltVT != LoadVT) 3308 Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt); 3309 3310 // If a promoted integer type is used, truncate down to the original 3311 MVT PromotedVT; 3312 if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) { 3313 Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); 3314 } 3315 3316 // Extend the element if necessary (e.g. an i8 is loaded 3317 // into an i16 register) 3318 if (Ins[InsIdx].VT.isInteger() && 3319 Ins[InsIdx].VT.getFixedSizeInBits() > 3320 LoadVT.getFixedSizeInBits()) { 3321 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND 3322 : ISD::ZERO_EXTEND; 3323 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); 3324 } 3325 InVals.push_back(Elt); 3326 } 3327 3328 // Reset vector tracking state. 3329 VecIdx = -1; 3330 } 3331 ++InsIdx; 3332 } 3333 if (VTs.size() > 0) 3334 --InsIdx; 3335 continue; 3336 } 3337 3338 // Param has ByVal attribute 3339 // Return MoveParam(param symbol). 3340 // Ideally, the param symbol can be returned directly, 3341 // but when the SDNode builder decides to use it in a CopyToReg(), 3342 // the machine instruction build fails because TargetExternalSymbol 3343 // (not lowered) is target dependent, and CopyToReg assumes 3344 // the source is lowered. 3345 EVT ObjectVT = getValueType(DL, Ty); 3346 assert(ObjectVT == Ins[InsIdx].VT && 3347 "Ins type did not match function type"); 3348 SDValue Arg = getParamSymbol(DAG, i, PtrVT); 3349 SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); 3350 if (p.getNode()) 3351 p.getNode()->setIROrder(i + 1); 3352 InVals.push_back(p); 3353 } 3354 3355 if (!OutChains.empty()) 3356 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); 3357 3358 return Chain; 3359 } 3360 3361 // Use byte-store when the param address of the return value is unaligned. 3362 // This may happen when the return value is a field of a packed structure.
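// Illustrative example: for a packed-struct return value whose i32 field sits
// at byte offset 1, the helper below emits four st.param.b8 stores at offsets
// 1 through 4, each storing one byte shifted down from the value.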
3363 static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, 3364 uint64_t Offset, EVT ElementType, 3365 SDValue RetVal, const SDLoc &dl) { 3366 // Bit logic only works on integer types 3367 if (adjustElementType(ElementType)) 3368 RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); 3369 3370 // Store each byte 3371 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { 3372 // Shift the byte to the last byte position 3373 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal, 3374 DAG.getConstant(i * 8, dl, MVT::i32)); 3375 SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32), 3376 ShiftVal}; 3377 // Trunc store only the last byte by using 3378 // st.param.b8 3379 // The register type can be larger than b8. 3380 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, 3381 DAG.getVTList(MVT::Other), StoreOperands, 3382 MVT::i8, MachinePointerInfo(), std::nullopt, 3383 MachineMemOperand::MOStore); 3384 } 3385 return Chain; 3386 } 3387 3388 SDValue 3389 NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 3390 bool isVarArg, 3391 const SmallVectorImpl<ISD::OutputArg> &Outs, 3392 const SmallVectorImpl<SDValue> &OutVals, 3393 const SDLoc &dl, SelectionDAG &DAG) const { 3394 const MachineFunction &MF = DAG.getMachineFunction(); 3395 const Function &F = MF.getFunction(); 3396 Type *RetTy = MF.getFunction().getReturnType(); 3397 3398 bool isABI = (STI.getSmVersion() >= 20); 3399 assert(isABI && "Non-ABI compilation is not supported"); 3400 if (!isABI) 3401 return Chain; 3402 3403 const DataLayout &DL = DAG.getDataLayout(); 3404 SmallVector<SDValue, 16> PromotedOutVals; 3405 SmallVector<EVT, 16> VTs; 3406 SmallVector<uint64_t, 16> Offsets; 3407 ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets); 3408 assert(VTs.size() == OutVals.size() && "Bad return value decomposition"); 3409 3410 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 3411 SDValue PromotedOutVal = OutVals[i]; 3412 MVT PromotedVT; 3413 if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) { 3414 VTs[i] = EVT(PromotedVT); 3415 } 3416 if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) { 3417 llvm::ISD::NodeType Ext = 3418 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 3419 PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal); 3420 } 3421 PromotedOutVals.push_back(PromotedOutVal); 3422 } 3423 3424 auto VectorInfo = VectorizePTXValueVTs( 3425 VTs, Offsets, 3426 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) 3427 : Align(1)); 3428 3429 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 3430 // 32-bits are sign extended or zero extended, depending on whether 3431 // they are signed or unsigned types. 3432 bool ExtendIntegerRetVal = 3433 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; 3434 3435 SmallVector<SDValue, 6> StoreOperands; 3436 for (unsigned i = 0, e = VTs.size(); i != e; ++i) { 3437 SDValue OutVal = OutVals[i]; 3438 SDValue RetVal = PromotedOutVals[i]; 3439 3440 if (ExtendIntegerRetVal) { 3441 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND 3442 : ISD::ZERO_EXTEND, 3443 dl, MVT::i32, RetVal); 3444 } else if (OutVal.getValueSizeInBits() < 16) { 3445 // Use 16-bit registers for small load-stores as it's the 3446 // smallest general purpose register size supported by NVPTX. 
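      // (E.g. a sub-16-bit element of an aggregate return value is
      // any-extended to i16 here; the store emitted below still records the
      // element's original type as its memory VT.)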
3447 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); 3448 } 3449 3450 // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned 3451 // for a scalar store. In such cases, fall back to byte stores. 3452 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) { 3453 EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 3454 Align ElementTypeAlign = 3455 DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext())); 3456 Align ElementAlign = 3457 commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]); 3458 if (ElementAlign < ElementTypeAlign) { 3459 assert(StoreOperands.empty() && "Orphaned operand list."); 3460 Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType, 3461 RetVal, dl); 3462 3463 // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes 3464 // into the graph, so just move on to the next element. 3465 continue; 3466 } 3467 } 3468 3469 // New load/store. Record chain and offset operands. 3470 if (VectorInfo[i] & PVF_FIRST) { 3471 assert(StoreOperands.empty() && "Orphaned operand list."); 3472 StoreOperands.push_back(Chain); 3473 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); 3474 } 3475 3476 // Record the value to return. 3477 StoreOperands.push_back(RetVal); 3478 3479 // That's the last element of this store op. 3480 if (VectorInfo[i] & PVF_LAST) { 3481 NVPTXISD::NodeType Op; 3482 unsigned NumElts = StoreOperands.size() - 2; 3483 switch (NumElts) { 3484 case 1: 3485 Op = NVPTXISD::StoreRetval; 3486 break; 3487 case 2: 3488 Op = NVPTXISD::StoreRetvalV2; 3489 break; 3490 case 4: 3491 Op = NVPTXISD::StoreRetvalV4; 3492 break; 3493 default: 3494 llvm_unreachable("Invalid vector info."); 3495 } 3496 3497 // Adjust type of load/store op if we've extended the scalar 3498 // return value. 3499 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; 3500 Chain = DAG.getMemIntrinsicNode( 3501 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, 3502 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); 3503 // Cleanup vector state. 
3504 StoreOperands.clear(); 3505 } 3506 } 3507 3508 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain); 3509 } 3510 3511 void NVPTXTargetLowering::LowerAsmOperandForConstraint( 3512 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, 3513 SelectionDAG &DAG) const { 3514 if (Constraint.size() > 1) 3515 return; 3516 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 3517 } 3518 3519 static unsigned getOpcForTextureInstr(unsigned Intrinsic) { 3520 switch (Intrinsic) { 3521 default: 3522 return 0; 3523 3524 case Intrinsic::nvvm_tex_1d_v4f32_s32: 3525 return NVPTXISD::Tex1DFloatS32; 3526 case Intrinsic::nvvm_tex_1d_v4f32_f32: 3527 return NVPTXISD::Tex1DFloatFloat; 3528 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 3529 return NVPTXISD::Tex1DFloatFloatLevel; 3530 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 3531 return NVPTXISD::Tex1DFloatFloatGrad; 3532 case Intrinsic::nvvm_tex_1d_v4s32_s32: 3533 return NVPTXISD::Tex1DS32S32; 3534 case Intrinsic::nvvm_tex_1d_v4s32_f32: 3535 return NVPTXISD::Tex1DS32Float; 3536 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 3537 return NVPTXISD::Tex1DS32FloatLevel; 3538 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 3539 return NVPTXISD::Tex1DS32FloatGrad; 3540 case Intrinsic::nvvm_tex_1d_v4u32_s32: 3541 return NVPTXISD::Tex1DU32S32; 3542 case Intrinsic::nvvm_tex_1d_v4u32_f32: 3543 return NVPTXISD::Tex1DU32Float; 3544 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 3545 return NVPTXISD::Tex1DU32FloatLevel; 3546 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 3547 return NVPTXISD::Tex1DU32FloatGrad; 3548 3549 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 3550 return NVPTXISD::Tex1DArrayFloatS32; 3551 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 3552 return NVPTXISD::Tex1DArrayFloatFloat; 3553 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 3554 return NVPTXISD::Tex1DArrayFloatFloatLevel; 3555 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 3556 return NVPTXISD::Tex1DArrayFloatFloatGrad; 3557 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 3558 return NVPTXISD::Tex1DArrayS32S32; 3559 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 3560 return NVPTXISD::Tex1DArrayS32Float; 3561 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 3562 return NVPTXISD::Tex1DArrayS32FloatLevel; 3563 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 3564 return NVPTXISD::Tex1DArrayS32FloatGrad; 3565 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 3566 return NVPTXISD::Tex1DArrayU32S32; 3567 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 3568 return NVPTXISD::Tex1DArrayU32Float; 3569 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 3570 return NVPTXISD::Tex1DArrayU32FloatLevel; 3571 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 3572 return NVPTXISD::Tex1DArrayU32FloatGrad; 3573 3574 case Intrinsic::nvvm_tex_2d_v4f32_s32: 3575 return NVPTXISD::Tex2DFloatS32; 3576 case Intrinsic::nvvm_tex_2d_v4f32_f32: 3577 return NVPTXISD::Tex2DFloatFloat; 3578 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 3579 return NVPTXISD::Tex2DFloatFloatLevel; 3580 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 3581 return NVPTXISD::Tex2DFloatFloatGrad; 3582 case Intrinsic::nvvm_tex_2d_v4s32_s32: 3583 return NVPTXISD::Tex2DS32S32; 3584 case Intrinsic::nvvm_tex_2d_v4s32_f32: 3585 return NVPTXISD::Tex2DS32Float; 3586 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 3587 return NVPTXISD::Tex2DS32FloatLevel; 3588 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 3589 return NVPTXISD::Tex2DS32FloatGrad; 3590 case Intrinsic::nvvm_tex_2d_v4u32_s32: 3591 return NVPTXISD::Tex2DU32S32; 3592 case 
Intrinsic::nvvm_tex_2d_v4u32_f32: 3593 return NVPTXISD::Tex2DU32Float; 3594 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 3595 return NVPTXISD::Tex2DU32FloatLevel; 3596 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 3597 return NVPTXISD::Tex2DU32FloatGrad; 3598 3599 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 3600 return NVPTXISD::Tex2DArrayFloatS32; 3601 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 3602 return NVPTXISD::Tex2DArrayFloatFloat; 3603 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 3604 return NVPTXISD::Tex2DArrayFloatFloatLevel; 3605 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 3606 return NVPTXISD::Tex2DArrayFloatFloatGrad; 3607 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 3608 return NVPTXISD::Tex2DArrayS32S32; 3609 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 3610 return NVPTXISD::Tex2DArrayS32Float; 3611 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 3612 return NVPTXISD::Tex2DArrayS32FloatLevel; 3613 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 3614 return NVPTXISD::Tex2DArrayS32FloatGrad; 3615 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 3616 return NVPTXISD::Tex2DArrayU32S32; 3617 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 3618 return NVPTXISD::Tex2DArrayU32Float; 3619 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 3620 return NVPTXISD::Tex2DArrayU32FloatLevel; 3621 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 3622 return NVPTXISD::Tex2DArrayU32FloatGrad; 3623 3624 case Intrinsic::nvvm_tex_3d_v4f32_s32: 3625 return NVPTXISD::Tex3DFloatS32; 3626 case Intrinsic::nvvm_tex_3d_v4f32_f32: 3627 return NVPTXISD::Tex3DFloatFloat; 3628 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 3629 return NVPTXISD::Tex3DFloatFloatLevel; 3630 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 3631 return NVPTXISD::Tex3DFloatFloatGrad; 3632 case Intrinsic::nvvm_tex_3d_v4s32_s32: 3633 return NVPTXISD::Tex3DS32S32; 3634 case Intrinsic::nvvm_tex_3d_v4s32_f32: 3635 return NVPTXISD::Tex3DS32Float; 3636 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 3637 return NVPTXISD::Tex3DS32FloatLevel; 3638 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 3639 return NVPTXISD::Tex3DS32FloatGrad; 3640 case Intrinsic::nvvm_tex_3d_v4u32_s32: 3641 return NVPTXISD::Tex3DU32S32; 3642 case Intrinsic::nvvm_tex_3d_v4u32_f32: 3643 return NVPTXISD::Tex3DU32Float; 3644 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 3645 return NVPTXISD::Tex3DU32FloatLevel; 3646 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 3647 return NVPTXISD::Tex3DU32FloatGrad; 3648 3649 case Intrinsic::nvvm_tex_cube_v4f32_f32: 3650 return NVPTXISD::TexCubeFloatFloat; 3651 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 3652 return NVPTXISD::TexCubeFloatFloatLevel; 3653 case Intrinsic::nvvm_tex_cube_v4s32_f32: 3654 return NVPTXISD::TexCubeS32Float; 3655 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 3656 return NVPTXISD::TexCubeS32FloatLevel; 3657 case Intrinsic::nvvm_tex_cube_v4u32_f32: 3658 return NVPTXISD::TexCubeU32Float; 3659 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 3660 return NVPTXISD::TexCubeU32FloatLevel; 3661 3662 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 3663 return NVPTXISD::TexCubeArrayFloatFloat; 3664 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 3665 return NVPTXISD::TexCubeArrayFloatFloatLevel; 3666 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 3667 return NVPTXISD::TexCubeArrayS32Float; 3668 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 3669 return NVPTXISD::TexCubeArrayS32FloatLevel; 3670 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 3671 return NVPTXISD::TexCubeArrayU32Float; 3672 case 
Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 3673 return NVPTXISD::TexCubeArrayU32FloatLevel; 3674 3675 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 3676 return NVPTXISD::Tld4R2DFloatFloat; 3677 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 3678 return NVPTXISD::Tld4G2DFloatFloat; 3679 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 3680 return NVPTXISD::Tld4B2DFloatFloat; 3681 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 3682 return NVPTXISD::Tld4A2DFloatFloat; 3683 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 3684 return NVPTXISD::Tld4R2DS64Float; 3685 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 3686 return NVPTXISD::Tld4G2DS64Float; 3687 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 3688 return NVPTXISD::Tld4B2DS64Float; 3689 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 3690 return NVPTXISD::Tld4A2DS64Float; 3691 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 3692 return NVPTXISD::Tld4R2DU64Float; 3693 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 3694 return NVPTXISD::Tld4G2DU64Float; 3695 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 3696 return NVPTXISD::Tld4B2DU64Float; 3697 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 3698 return NVPTXISD::Tld4A2DU64Float; 3699 3700 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 3701 return NVPTXISD::TexUnified1DFloatS32; 3702 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 3703 return NVPTXISD::TexUnified1DFloatFloat; 3704 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 3705 return NVPTXISD::TexUnified1DFloatFloatLevel; 3706 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 3707 return NVPTXISD::TexUnified1DFloatFloatGrad; 3708 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 3709 return NVPTXISD::TexUnified1DS32S32; 3710 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 3711 return NVPTXISD::TexUnified1DS32Float; 3712 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 3713 return NVPTXISD::TexUnified1DS32FloatLevel; 3714 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 3715 return NVPTXISD::TexUnified1DS32FloatGrad; 3716 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 3717 return NVPTXISD::TexUnified1DU32S32; 3718 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 3719 return NVPTXISD::TexUnified1DU32Float; 3720 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 3721 return NVPTXISD::TexUnified1DU32FloatLevel; 3722 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 3723 return NVPTXISD::TexUnified1DU32FloatGrad; 3724 3725 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 3726 return NVPTXISD::TexUnified1DArrayFloatS32; 3727 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 3728 return NVPTXISD::TexUnified1DArrayFloatFloat; 3729 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 3730 return NVPTXISD::TexUnified1DArrayFloatFloatLevel; 3731 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 3732 return NVPTXISD::TexUnified1DArrayFloatFloatGrad; 3733 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 3734 return NVPTXISD::TexUnified1DArrayS32S32; 3735 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 3736 return NVPTXISD::TexUnified1DArrayS32Float; 3737 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 3738 return NVPTXISD::TexUnified1DArrayS32FloatLevel; 3739 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 3740 return NVPTXISD::TexUnified1DArrayS32FloatGrad; 3741 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 3742 return NVPTXISD::TexUnified1DArrayU32S32; 3743 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 3744 return NVPTXISD::TexUnified1DArrayU32Float; 3745 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 3746 return 
NVPTXISD::TexUnified1DArrayU32FloatLevel; 3747 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 3748 return NVPTXISD::TexUnified1DArrayU32FloatGrad; 3749 3750 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 3751 return NVPTXISD::TexUnified2DFloatS32; 3752 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 3753 return NVPTXISD::TexUnified2DFloatFloat; 3754 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 3755 return NVPTXISD::TexUnified2DFloatFloatLevel; 3756 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 3757 return NVPTXISD::TexUnified2DFloatFloatGrad; 3758 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 3759 return NVPTXISD::TexUnified2DS32S32; 3760 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 3761 return NVPTXISD::TexUnified2DS32Float; 3762 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 3763 return NVPTXISD::TexUnified2DS32FloatLevel; 3764 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 3765 return NVPTXISD::TexUnified2DS32FloatGrad; 3766 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 3767 return NVPTXISD::TexUnified2DU32S32; 3768 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 3769 return NVPTXISD::TexUnified2DU32Float; 3770 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 3771 return NVPTXISD::TexUnified2DU32FloatLevel; 3772 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 3773 return NVPTXISD::TexUnified2DU32FloatGrad; 3774 3775 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 3776 return NVPTXISD::TexUnified2DArrayFloatS32; 3777 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 3778 return NVPTXISD::TexUnified2DArrayFloatFloat; 3779 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 3780 return NVPTXISD::TexUnified2DArrayFloatFloatLevel; 3781 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 3782 return NVPTXISD::TexUnified2DArrayFloatFloatGrad; 3783 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 3784 return NVPTXISD::TexUnified2DArrayS32S32; 3785 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 3786 return NVPTXISD::TexUnified2DArrayS32Float; 3787 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 3788 return NVPTXISD::TexUnified2DArrayS32FloatLevel; 3789 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 3790 return NVPTXISD::TexUnified2DArrayS32FloatGrad; 3791 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 3792 return NVPTXISD::TexUnified2DArrayU32S32; 3793 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 3794 return NVPTXISD::TexUnified2DArrayU32Float; 3795 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 3796 return NVPTXISD::TexUnified2DArrayU32FloatLevel; 3797 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 3798 return NVPTXISD::TexUnified2DArrayU32FloatGrad; 3799 3800 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 3801 return NVPTXISD::TexUnified3DFloatS32; 3802 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 3803 return NVPTXISD::TexUnified3DFloatFloat; 3804 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 3805 return NVPTXISD::TexUnified3DFloatFloatLevel; 3806 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 3807 return NVPTXISD::TexUnified3DFloatFloatGrad; 3808 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 3809 return NVPTXISD::TexUnified3DS32S32; 3810 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 3811 return NVPTXISD::TexUnified3DS32Float; 3812 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 3813 return NVPTXISD::TexUnified3DS32FloatLevel; 3814 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 3815 return NVPTXISD::TexUnified3DS32FloatGrad; 3816 case 
Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 3817 return NVPTXISD::TexUnified3DU32S32; 3818 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 3819 return NVPTXISD::TexUnified3DU32Float; 3820 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 3821 return NVPTXISD::TexUnified3DU32FloatLevel; 3822 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 3823 return NVPTXISD::TexUnified3DU32FloatGrad; 3824 3825 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 3826 return NVPTXISD::TexUnifiedCubeFloatFloat; 3827 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 3828 return NVPTXISD::TexUnifiedCubeFloatFloatLevel; 3829 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 3830 return NVPTXISD::TexUnifiedCubeS32Float; 3831 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 3832 return NVPTXISD::TexUnifiedCubeS32FloatLevel; 3833 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 3834 return NVPTXISD::TexUnifiedCubeU32Float; 3835 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 3836 return NVPTXISD::TexUnifiedCubeU32FloatLevel; 3837 3838 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 3839 return NVPTXISD::TexUnifiedCubeArrayFloatFloat; 3840 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 3841 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; 3842 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 3843 return NVPTXISD::TexUnifiedCubeArrayS32Float; 3844 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 3845 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; 3846 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 3847 return NVPTXISD::TexUnifiedCubeArrayU32Float; 3848 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 3849 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; 3850 3851 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: 3852 return NVPTXISD::TexUnifiedCubeFloatFloatGrad; 3853 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: 3854 return NVPTXISD::TexUnifiedCubeS32FloatGrad; 3855 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: 3856 return NVPTXISD::TexUnifiedCubeU32FloatGrad; 3857 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: 3858 return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad; 3859 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: 3860 return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad; 3861 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: 3862 return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad; 3863 3864 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 3865 return NVPTXISD::Tld4UnifiedR2DFloatFloat; 3866 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 3867 return NVPTXISD::Tld4UnifiedG2DFloatFloat; 3868 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 3869 return NVPTXISD::Tld4UnifiedB2DFloatFloat; 3870 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 3871 return NVPTXISD::Tld4UnifiedA2DFloatFloat; 3872 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 3873 return NVPTXISD::Tld4UnifiedR2DS64Float; 3874 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 3875 return NVPTXISD::Tld4UnifiedG2DS64Float; 3876 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 3877 return NVPTXISD::Tld4UnifiedB2DS64Float; 3878 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 3879 return NVPTXISD::Tld4UnifiedA2DS64Float; 3880 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 3881 return NVPTXISD::Tld4UnifiedR2DU64Float; 3882 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 3883 return NVPTXISD::Tld4UnifiedG2DU64Float; 3884 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 3885 return 
NVPTXISD::Tld4UnifiedB2DU64Float; 3886 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 3887 return NVPTXISD::Tld4UnifiedA2DU64Float; 3888 } 3889 } 3890 3891 static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { 3892 switch (Intrinsic) { 3893 default: 3894 return 0; 3895 case Intrinsic::nvvm_suld_1d_i8_clamp: 3896 return NVPTXISD::Suld1DI8Clamp; 3897 case Intrinsic::nvvm_suld_1d_i16_clamp: 3898 return NVPTXISD::Suld1DI16Clamp; 3899 case Intrinsic::nvvm_suld_1d_i32_clamp: 3900 return NVPTXISD::Suld1DI32Clamp; 3901 case Intrinsic::nvvm_suld_1d_i64_clamp: 3902 return NVPTXISD::Suld1DI64Clamp; 3903 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 3904 return NVPTXISD::Suld1DV2I8Clamp; 3905 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 3906 return NVPTXISD::Suld1DV2I16Clamp; 3907 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 3908 return NVPTXISD::Suld1DV2I32Clamp; 3909 case Intrinsic::nvvm_suld_1d_v2i64_clamp: 3910 return NVPTXISD::Suld1DV2I64Clamp; 3911 case Intrinsic::nvvm_suld_1d_v4i8_clamp: 3912 return NVPTXISD::Suld1DV4I8Clamp; 3913 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 3914 return NVPTXISD::Suld1DV4I16Clamp; 3915 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 3916 return NVPTXISD::Suld1DV4I32Clamp; 3917 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 3918 return NVPTXISD::Suld1DArrayI8Clamp; 3919 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 3920 return NVPTXISD::Suld1DArrayI16Clamp; 3921 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 3922 return NVPTXISD::Suld1DArrayI32Clamp; 3923 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 3924 return NVPTXISD::Suld1DArrayI64Clamp; 3925 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 3926 return NVPTXISD::Suld1DArrayV2I8Clamp; 3927 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 3928 return NVPTXISD::Suld1DArrayV2I16Clamp; 3929 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 3930 return NVPTXISD::Suld1DArrayV2I32Clamp; 3931 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 3932 return NVPTXISD::Suld1DArrayV2I64Clamp; 3933 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 3934 return NVPTXISD::Suld1DArrayV4I8Clamp; 3935 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 3936 return NVPTXISD::Suld1DArrayV4I16Clamp; 3937 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 3938 return NVPTXISD::Suld1DArrayV4I32Clamp; 3939 case Intrinsic::nvvm_suld_2d_i8_clamp: 3940 return NVPTXISD::Suld2DI8Clamp; 3941 case Intrinsic::nvvm_suld_2d_i16_clamp: 3942 return NVPTXISD::Suld2DI16Clamp; 3943 case Intrinsic::nvvm_suld_2d_i32_clamp: 3944 return NVPTXISD::Suld2DI32Clamp; 3945 case Intrinsic::nvvm_suld_2d_i64_clamp: 3946 return NVPTXISD::Suld2DI64Clamp; 3947 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 3948 return NVPTXISD::Suld2DV2I8Clamp; 3949 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 3950 return NVPTXISD::Suld2DV2I16Clamp; 3951 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 3952 return NVPTXISD::Suld2DV2I32Clamp; 3953 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 3954 return NVPTXISD::Suld2DV2I64Clamp; 3955 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 3956 return NVPTXISD::Suld2DV4I8Clamp; 3957 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 3958 return NVPTXISD::Suld2DV4I16Clamp; 3959 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 3960 return NVPTXISD::Suld2DV4I32Clamp; 3961 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 3962 return NVPTXISD::Suld2DArrayI8Clamp; 3963 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 3964 return NVPTXISD::Suld2DArrayI16Clamp; 3965 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 3966 return NVPTXISD::Suld2DArrayI32Clamp; 3967 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 3968 return 
NVPTXISD::Suld2DArrayI64Clamp; 3969 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 3970 return NVPTXISD::Suld2DArrayV2I8Clamp; 3971 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 3972 return NVPTXISD::Suld2DArrayV2I16Clamp; 3973 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 3974 return NVPTXISD::Suld2DArrayV2I32Clamp; 3975 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 3976 return NVPTXISD::Suld2DArrayV2I64Clamp; 3977 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 3978 return NVPTXISD::Suld2DArrayV4I8Clamp; 3979 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 3980 return NVPTXISD::Suld2DArrayV4I16Clamp; 3981 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 3982 return NVPTXISD::Suld2DArrayV4I32Clamp; 3983 case Intrinsic::nvvm_suld_3d_i8_clamp: 3984 return NVPTXISD::Suld3DI8Clamp; 3985 case Intrinsic::nvvm_suld_3d_i16_clamp: 3986 return NVPTXISD::Suld3DI16Clamp; 3987 case Intrinsic::nvvm_suld_3d_i32_clamp: 3988 return NVPTXISD::Suld3DI32Clamp; 3989 case Intrinsic::nvvm_suld_3d_i64_clamp: 3990 return NVPTXISD::Suld3DI64Clamp; 3991 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 3992 return NVPTXISD::Suld3DV2I8Clamp; 3993 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 3994 return NVPTXISD::Suld3DV2I16Clamp; 3995 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 3996 return NVPTXISD::Suld3DV2I32Clamp; 3997 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 3998 return NVPTXISD::Suld3DV2I64Clamp; 3999 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4000 return NVPTXISD::Suld3DV4I8Clamp; 4001 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4002 return NVPTXISD::Suld3DV4I16Clamp; 4003 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 4004 return NVPTXISD::Suld3DV4I32Clamp; 4005 case Intrinsic::nvvm_suld_1d_i8_trap: 4006 return NVPTXISD::Suld1DI8Trap; 4007 case Intrinsic::nvvm_suld_1d_i16_trap: 4008 return NVPTXISD::Suld1DI16Trap; 4009 case Intrinsic::nvvm_suld_1d_i32_trap: 4010 return NVPTXISD::Suld1DI32Trap; 4011 case Intrinsic::nvvm_suld_1d_i64_trap: 4012 return NVPTXISD::Suld1DI64Trap; 4013 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4014 return NVPTXISD::Suld1DV2I8Trap; 4015 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4016 return NVPTXISD::Suld1DV2I16Trap; 4017 case Intrinsic::nvvm_suld_1d_v2i32_trap: 4018 return NVPTXISD::Suld1DV2I32Trap; 4019 case Intrinsic::nvvm_suld_1d_v2i64_trap: 4020 return NVPTXISD::Suld1DV2I64Trap; 4021 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4022 return NVPTXISD::Suld1DV4I8Trap; 4023 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4024 return NVPTXISD::Suld1DV4I16Trap; 4025 case Intrinsic::nvvm_suld_1d_v4i32_trap: 4026 return NVPTXISD::Suld1DV4I32Trap; 4027 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4028 return NVPTXISD::Suld1DArrayI8Trap; 4029 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4030 return NVPTXISD::Suld1DArrayI16Trap; 4031 case Intrinsic::nvvm_suld_1d_array_i32_trap: 4032 return NVPTXISD::Suld1DArrayI32Trap; 4033 case Intrinsic::nvvm_suld_1d_array_i64_trap: 4034 return NVPTXISD::Suld1DArrayI64Trap; 4035 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4036 return NVPTXISD::Suld1DArrayV2I8Trap; 4037 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4038 return NVPTXISD::Suld1DArrayV2I16Trap; 4039 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 4040 return NVPTXISD::Suld1DArrayV2I32Trap; 4041 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 4042 return NVPTXISD::Suld1DArrayV2I64Trap; 4043 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4044 return NVPTXISD::Suld1DArrayV4I8Trap; 4045 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4046 return NVPTXISD::Suld1DArrayV4I16Trap; 4047 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 
4048 return NVPTXISD::Suld1DArrayV4I32Trap; 4049 case Intrinsic::nvvm_suld_2d_i8_trap: 4050 return NVPTXISD::Suld2DI8Trap; 4051 case Intrinsic::nvvm_suld_2d_i16_trap: 4052 return NVPTXISD::Suld2DI16Trap; 4053 case Intrinsic::nvvm_suld_2d_i32_trap: 4054 return NVPTXISD::Suld2DI32Trap; 4055 case Intrinsic::nvvm_suld_2d_i64_trap: 4056 return NVPTXISD::Suld2DI64Trap; 4057 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4058 return NVPTXISD::Suld2DV2I8Trap; 4059 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4060 return NVPTXISD::Suld2DV2I16Trap; 4061 case Intrinsic::nvvm_suld_2d_v2i32_trap: 4062 return NVPTXISD::Suld2DV2I32Trap; 4063 case Intrinsic::nvvm_suld_2d_v2i64_trap: 4064 return NVPTXISD::Suld2DV2I64Trap; 4065 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4066 return NVPTXISD::Suld2DV4I8Trap; 4067 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4068 return NVPTXISD::Suld2DV4I16Trap; 4069 case Intrinsic::nvvm_suld_2d_v4i32_trap: 4070 return NVPTXISD::Suld2DV4I32Trap; 4071 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4072 return NVPTXISD::Suld2DArrayI8Trap; 4073 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4074 return NVPTXISD::Suld2DArrayI16Trap; 4075 case Intrinsic::nvvm_suld_2d_array_i32_trap: 4076 return NVPTXISD::Suld2DArrayI32Trap; 4077 case Intrinsic::nvvm_suld_2d_array_i64_trap: 4078 return NVPTXISD::Suld2DArrayI64Trap; 4079 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4080 return NVPTXISD::Suld2DArrayV2I8Trap; 4081 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4082 return NVPTXISD::Suld2DArrayV2I16Trap; 4083 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 4084 return NVPTXISD::Suld2DArrayV2I32Trap; 4085 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 4086 return NVPTXISD::Suld2DArrayV2I64Trap; 4087 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4088 return NVPTXISD::Suld2DArrayV4I8Trap; 4089 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 4090 return NVPTXISD::Suld2DArrayV4I16Trap; 4091 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 4092 return NVPTXISD::Suld2DArrayV4I32Trap; 4093 case Intrinsic::nvvm_suld_3d_i8_trap: 4094 return NVPTXISD::Suld3DI8Trap; 4095 case Intrinsic::nvvm_suld_3d_i16_trap: 4096 return NVPTXISD::Suld3DI16Trap; 4097 case Intrinsic::nvvm_suld_3d_i32_trap: 4098 return NVPTXISD::Suld3DI32Trap; 4099 case Intrinsic::nvvm_suld_3d_i64_trap: 4100 return NVPTXISD::Suld3DI64Trap; 4101 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4102 return NVPTXISD::Suld3DV2I8Trap; 4103 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4104 return NVPTXISD::Suld3DV2I16Trap; 4105 case Intrinsic::nvvm_suld_3d_v2i32_trap: 4106 return NVPTXISD::Suld3DV2I32Trap; 4107 case Intrinsic::nvvm_suld_3d_v2i64_trap: 4108 return NVPTXISD::Suld3DV2I64Trap; 4109 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4110 return NVPTXISD::Suld3DV4I8Trap; 4111 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4112 return NVPTXISD::Suld3DV4I16Trap; 4113 case Intrinsic::nvvm_suld_3d_v4i32_trap: 4114 return NVPTXISD::Suld3DV4I32Trap; 4115 case Intrinsic::nvvm_suld_1d_i8_zero: 4116 return NVPTXISD::Suld1DI8Zero; 4117 case Intrinsic::nvvm_suld_1d_i16_zero: 4118 return NVPTXISD::Suld1DI16Zero; 4119 case Intrinsic::nvvm_suld_1d_i32_zero: 4120 return NVPTXISD::Suld1DI32Zero; 4121 case Intrinsic::nvvm_suld_1d_i64_zero: 4122 return NVPTXISD::Suld1DI64Zero; 4123 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4124 return NVPTXISD::Suld1DV2I8Zero; 4125 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4126 return NVPTXISD::Suld1DV2I16Zero; 4127 case Intrinsic::nvvm_suld_1d_v2i32_zero: 4128 return NVPTXISD::Suld1DV2I32Zero; 4129 case Intrinsic::nvvm_suld_1d_v2i64_zero: 4130 return 
NVPTXISD::Suld1DV2I64Zero; 4131 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4132 return NVPTXISD::Suld1DV4I8Zero; 4133 case Intrinsic::nvvm_suld_1d_v4i16_zero: 4134 return NVPTXISD::Suld1DV4I16Zero; 4135 case Intrinsic::nvvm_suld_1d_v4i32_zero: 4136 return NVPTXISD::Suld1DV4I32Zero; 4137 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4138 return NVPTXISD::Suld1DArrayI8Zero; 4139 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4140 return NVPTXISD::Suld1DArrayI16Zero; 4141 case Intrinsic::nvvm_suld_1d_array_i32_zero: 4142 return NVPTXISD::Suld1DArrayI32Zero; 4143 case Intrinsic::nvvm_suld_1d_array_i64_zero: 4144 return NVPTXISD::Suld1DArrayI64Zero; 4145 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4146 return NVPTXISD::Suld1DArrayV2I8Zero; 4147 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4148 return NVPTXISD::Suld1DArrayV2I16Zero; 4149 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 4150 return NVPTXISD::Suld1DArrayV2I32Zero; 4151 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 4152 return NVPTXISD::Suld1DArrayV2I64Zero; 4153 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4154 return NVPTXISD::Suld1DArrayV4I8Zero; 4155 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4156 return NVPTXISD::Suld1DArrayV4I16Zero; 4157 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 4158 return NVPTXISD::Suld1DArrayV4I32Zero; 4159 case Intrinsic::nvvm_suld_2d_i8_zero: 4160 return NVPTXISD::Suld2DI8Zero; 4161 case Intrinsic::nvvm_suld_2d_i16_zero: 4162 return NVPTXISD::Suld2DI16Zero; 4163 case Intrinsic::nvvm_suld_2d_i32_zero: 4164 return NVPTXISD::Suld2DI32Zero; 4165 case Intrinsic::nvvm_suld_2d_i64_zero: 4166 return NVPTXISD::Suld2DI64Zero; 4167 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4168 return NVPTXISD::Suld2DV2I8Zero; 4169 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4170 return NVPTXISD::Suld2DV2I16Zero; 4171 case Intrinsic::nvvm_suld_2d_v2i32_zero: 4172 return NVPTXISD::Suld2DV2I32Zero; 4173 case Intrinsic::nvvm_suld_2d_v2i64_zero: 4174 return NVPTXISD::Suld2DV2I64Zero; 4175 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4176 return NVPTXISD::Suld2DV4I8Zero; 4177 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4178 return NVPTXISD::Suld2DV4I16Zero; 4179 case Intrinsic::nvvm_suld_2d_v4i32_zero: 4180 return NVPTXISD::Suld2DV4I32Zero; 4181 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4182 return NVPTXISD::Suld2DArrayI8Zero; 4183 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4184 return NVPTXISD::Suld2DArrayI16Zero; 4185 case Intrinsic::nvvm_suld_2d_array_i32_zero: 4186 return NVPTXISD::Suld2DArrayI32Zero; 4187 case Intrinsic::nvvm_suld_2d_array_i64_zero: 4188 return NVPTXISD::Suld2DArrayI64Zero; 4189 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4190 return NVPTXISD::Suld2DArrayV2I8Zero; 4191 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4192 return NVPTXISD::Suld2DArrayV2I16Zero; 4193 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 4194 return NVPTXISD::Suld2DArrayV2I32Zero; 4195 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 4196 return NVPTXISD::Suld2DArrayV2I64Zero; 4197 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4198 return NVPTXISD::Suld2DArrayV4I8Zero; 4199 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4200 return NVPTXISD::Suld2DArrayV4I16Zero; 4201 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 4202 return NVPTXISD::Suld2DArrayV4I32Zero; 4203 case Intrinsic::nvvm_suld_3d_i8_zero: 4204 return NVPTXISD::Suld3DI8Zero; 4205 case Intrinsic::nvvm_suld_3d_i16_zero: 4206 return NVPTXISD::Suld3DI16Zero; 4207 case Intrinsic::nvvm_suld_3d_i32_zero: 4208 return NVPTXISD::Suld3DI32Zero; 4209 case Intrinsic::nvvm_suld_3d_i64_zero: 4210 
return NVPTXISD::Suld3DI64Zero; 4211 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4212 return NVPTXISD::Suld3DV2I8Zero; 4213 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4214 return NVPTXISD::Suld3DV2I16Zero; 4215 case Intrinsic::nvvm_suld_3d_v2i32_zero: 4216 return NVPTXISD::Suld3DV2I32Zero; 4217 case Intrinsic::nvvm_suld_3d_v2i64_zero: 4218 return NVPTXISD::Suld3DV2I64Zero; 4219 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4220 return NVPTXISD::Suld3DV4I8Zero; 4221 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4222 return NVPTXISD::Suld3DV4I16Zero; 4223 case Intrinsic::nvvm_suld_3d_v4i32_zero: 4224 return NVPTXISD::Suld3DV4I32Zero; 4225 } 4226 } 4227 4228 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 4229 // TgtMemIntrinsic 4230 // because we need the information that is only available in the "Value" type 4231 // of destination 4232 // pointer. In particular, the address space information. 4233 bool NVPTXTargetLowering::getTgtMemIntrinsic( 4234 IntrinsicInfo &Info, const CallInst &I, 4235 MachineFunction &MF, unsigned Intrinsic) const { 4236 switch (Intrinsic) { 4237 default: 4238 return false; 4239 case Intrinsic::nvvm_match_all_sync_i32p: 4240 case Intrinsic::nvvm_match_all_sync_i64p: 4241 Info.opc = ISD::INTRINSIC_W_CHAIN; 4242 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 4243 // in order to model data exchange with other threads, but perform no real 4244 // memory accesses. 4245 Info.memVT = MVT::i1; 4246 4247 // Our result depends on both our and other thread's arguments. 4248 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4249 return true; 4250 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: 4251 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: 4252 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: 4253 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: 4254 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: 4255 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: 4256 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: 4257 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: 4258 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: 4259 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: 4260 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: 4261 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: 4262 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: 4263 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: 4264 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: 4265 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: 4266 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: 4267 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: 4268 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: 4269 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: 4270 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: 4271 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: 4272 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: 4273 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { 4274 Info.opc = ISD::INTRINSIC_W_CHAIN; 4275 Info.memVT = MVT::v8f16; 4276 Info.ptrVal = I.getArgOperand(0); 4277 Info.offset = 0; 4278 Info.flags = MachineMemOperand::MOLoad; 4279 Info.align = Align(16); 4280 return true; 4281 } 4282 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: 4283 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: 4284 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: 4285 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: 4286 case 
Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: 4287 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: 4288 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: 4289 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: 4290 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: 4291 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: 4292 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: 4293 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: 4294 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: 4295 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: 4296 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: 4297 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: 4298 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: 4299 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: 4300 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: 4301 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: 4302 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: 4303 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: 4304 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: 4305 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { 4306 Info.opc = ISD::INTRINSIC_W_CHAIN; 4307 Info.memVT = MVT::v2i32; 4308 Info.ptrVal = I.getArgOperand(0); 4309 Info.offset = 0; 4310 Info.flags = MachineMemOperand::MOLoad; 4311 Info.align = Align(8); 4312 return true; 4313 } 4314 4315 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: 4316 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: 4317 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: 4318 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: 4319 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: 4320 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: 4321 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: 4322 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: 4323 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: 4324 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: 4325 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: 4326 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: 4327 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: 4328 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: 4329 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: 4330 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: 4331 4332 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: 4333 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: 4334 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: 4335 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: 4336 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: 4337 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: 4338 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: 4339 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: 4340 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: 4341 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: 4342 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: 4343 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: 4344 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: 4345 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: 4346 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: 4347 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: 4348 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: 4349 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: { 4350 Info.opc = ISD::INTRINSIC_W_CHAIN; 4351 Info.memVT = MVT::v4i32; 4352 
Info.ptrVal = I.getArgOperand(0); 4353 Info.offset = 0; 4354 Info.flags = MachineMemOperand::MOLoad; 4355 Info.align = Align(16); 4356 return true; 4357 } 4358 4359 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: 4360 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: 4361 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: 4362 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: 4363 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: 4364 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: 4365 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: 4366 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: 4367 4368 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: 4369 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: 4370 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: 4371 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: 4372 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: 4373 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: 4374 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: 4375 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: 4376 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: 4377 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: 4378 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: 4379 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: 4380 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: 4381 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: 4382 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: 4383 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: 4384 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: 4385 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: 4386 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: 4387 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: 4388 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: 4389 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: { 4390 Info.opc = ISD::INTRINSIC_W_CHAIN; 4391 Info.memVT = MVT::i32; 4392 Info.ptrVal = I.getArgOperand(0); 4393 Info.offset = 0; 4394 Info.flags = MachineMemOperand::MOLoad; 4395 Info.align = Align(4); 4396 return true; 4397 } 4398 4399 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: 4400 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: 4401 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: 4402 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: 4403 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: 4404 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: 4405 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: 4406 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: 4407 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: 4408 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: 4409 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: 4410 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { 4411 Info.opc = ISD::INTRINSIC_W_CHAIN; 4412 Info.memVT = MVT::v4f16; 4413 Info.ptrVal = I.getArgOperand(0); 4414 Info.offset = 0; 4415 Info.flags = MachineMemOperand::MOLoad; 4416 Info.align = Align(16); 4417 return true; 4418 } 4419 4420 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: 4421 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: 4422 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: 4423 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: 4424 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: 4425 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: 4426 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: 4427 case 
Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: 4428 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: 4429 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: 4430 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: 4431 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: 4432 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: 4433 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: 4434 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: 4435 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { 4436 Info.opc = ISD::INTRINSIC_W_CHAIN; 4437 Info.memVT = MVT::v8f32; 4438 Info.ptrVal = I.getArgOperand(0); 4439 Info.offset = 0; 4440 Info.flags = MachineMemOperand::MOLoad; 4441 Info.align = Align(16); 4442 return true; 4443 } 4444 4445 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: 4446 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: 4447 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: 4448 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: 4449 4450 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: 4451 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: 4452 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: 4453 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: 4454 4455 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: 4456 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: 4457 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: 4458 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: 4459 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: 4460 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: 4461 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: 4462 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: 4463 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: 4464 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: 4465 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: 4466 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { 4467 Info.opc = ISD::INTRINSIC_W_CHAIN; 4468 Info.memVT = MVT::v8i32; 4469 Info.ptrVal = I.getArgOperand(0); 4470 Info.offset = 0; 4471 Info.flags = MachineMemOperand::MOLoad; 4472 Info.align = Align(16); 4473 return true; 4474 } 4475 4476 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: 4477 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: 4478 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: 4479 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: 4480 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: 4481 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: 4482 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: 4483 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: 4484 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: 4485 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: { 4486 Info.opc = ISD::INTRINSIC_W_CHAIN; 4487 Info.memVT = MVT::v2i32; 4488 Info.ptrVal = I.getArgOperand(0); 4489 Info.offset = 0; 4490 Info.flags = MachineMemOperand::MOLoad; 4491 Info.align = Align(8); 4492 return true; 4493 } 4494 4495 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: 4496 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: 4497 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: 4498 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: 4499 4500 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: 4501 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: 4502 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: 4503 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { 4504 Info.opc = 
ISD::INTRINSIC_W_CHAIN; 4505 Info.memVT = MVT::f64; 4506 Info.ptrVal = I.getArgOperand(0); 4507 Info.offset = 0; 4508 Info.flags = MachineMemOperand::MOLoad; 4509 Info.align = Align(8); 4510 return true; 4511 } 4512 4513 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: 4514 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: 4515 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: 4516 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { 4517 Info.opc = ISD::INTRINSIC_W_CHAIN; 4518 Info.memVT = MVT::v2f64; 4519 Info.ptrVal = I.getArgOperand(0); 4520 Info.offset = 0; 4521 Info.flags = MachineMemOperand::MOLoad; 4522 Info.align = Align(16); 4523 return true; 4524 } 4525 4526 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: 4527 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: 4528 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: 4529 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: 4530 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: 4531 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: 4532 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: 4533 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: 4534 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: 4535 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: 4536 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: 4537 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { 4538 Info.opc = ISD::INTRINSIC_VOID; 4539 Info.memVT = MVT::v4f16; 4540 Info.ptrVal = I.getArgOperand(0); 4541 Info.offset = 0; 4542 Info.flags = MachineMemOperand::MOStore; 4543 Info.align = Align(16); 4544 return true; 4545 } 4546 4547 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: 4548 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: 4549 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: 4550 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: 4551 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: 4552 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: 4553 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: 4554 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: 4555 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: 4556 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: 4557 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: 4558 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: 4559 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: 4560 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: 4561 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: 4562 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { 4563 Info.opc = ISD::INTRINSIC_VOID; 4564 Info.memVT = MVT::v8f32; 4565 Info.ptrVal = I.getArgOperand(0); 4566 Info.offset = 0; 4567 Info.flags = MachineMemOperand::MOStore; 4568 Info.align = Align(16); 4569 return true; 4570 } 4571 4572 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: 4573 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: 4574 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: 4575 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: 4576 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: 4577 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: 4578 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: 4579 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: 4580 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: 4581 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: 4582 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: 4583 case 
Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { 4584 Info.opc = ISD::INTRINSIC_VOID; 4585 Info.memVT = MVT::v8i32; 4586 Info.ptrVal = I.getArgOperand(0); 4587 Info.offset = 0; 4588 Info.flags = MachineMemOperand::MOStore; 4589 Info.align = Align(16); 4590 return true; 4591 } 4592 4593 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: 4594 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: 4595 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: 4596 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: 4597 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: 4598 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: 4599 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: 4600 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { 4601 Info.opc = ISD::INTRINSIC_VOID; 4602 Info.memVT = MVT::v2i32; 4603 Info.ptrVal = I.getArgOperand(0); 4604 Info.offset = 0; 4605 Info.flags = MachineMemOperand::MOStore; 4606 Info.align = Align(8); 4607 return true; 4608 } 4609 4610 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: 4611 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: 4612 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: 4613 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { 4614 Info.opc = ISD::INTRINSIC_VOID; 4615 Info.memVT = MVT::v2f64; 4616 Info.ptrVal = I.getArgOperand(0); 4617 Info.offset = 0; 4618 Info.flags = MachineMemOperand::MOStore; 4619 Info.align = Align(16); 4620 return true; 4621 } 4622 4623 case Intrinsic::nvvm_atomic_load_inc_32: 4624 case Intrinsic::nvvm_atomic_load_dec_32: 4625 4626 case Intrinsic::nvvm_atomic_add_gen_f_cta: 4627 case Intrinsic::nvvm_atomic_add_gen_f_sys: 4628 case Intrinsic::nvvm_atomic_add_gen_i_cta: 4629 case Intrinsic::nvvm_atomic_add_gen_i_sys: 4630 case Intrinsic::nvvm_atomic_and_gen_i_cta: 4631 case Intrinsic::nvvm_atomic_and_gen_i_sys: 4632 case Intrinsic::nvvm_atomic_cas_gen_i_cta: 4633 case Intrinsic::nvvm_atomic_cas_gen_i_sys: 4634 case Intrinsic::nvvm_atomic_dec_gen_i_cta: 4635 case Intrinsic::nvvm_atomic_dec_gen_i_sys: 4636 case Intrinsic::nvvm_atomic_inc_gen_i_cta: 4637 case Intrinsic::nvvm_atomic_inc_gen_i_sys: 4638 case Intrinsic::nvvm_atomic_max_gen_i_cta: 4639 case Intrinsic::nvvm_atomic_max_gen_i_sys: 4640 case Intrinsic::nvvm_atomic_min_gen_i_cta: 4641 case Intrinsic::nvvm_atomic_min_gen_i_sys: 4642 case Intrinsic::nvvm_atomic_or_gen_i_cta: 4643 case Intrinsic::nvvm_atomic_or_gen_i_sys: 4644 case Intrinsic::nvvm_atomic_exch_gen_i_cta: 4645 case Intrinsic::nvvm_atomic_exch_gen_i_sys: 4646 case Intrinsic::nvvm_atomic_xor_gen_i_cta: 4647 case Intrinsic::nvvm_atomic_xor_gen_i_sys: { 4648 auto &DL = I.getDataLayout(); 4649 Info.opc = ISD::INTRINSIC_W_CHAIN; 4650 Info.memVT = getValueType(DL, I.getType()); 4651 Info.ptrVal = I.getArgOperand(0); 4652 Info.offset = 0; 4653 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; 4654 Info.align.reset(); 4655 return true; 4656 } 4657 4658 case Intrinsic::nvvm_ldu_global_i: 4659 case Intrinsic::nvvm_ldu_global_f: 4660 case Intrinsic::nvvm_ldu_global_p: { 4661 auto &DL = I.getDataLayout(); 4662 Info.opc = ISD::INTRINSIC_W_CHAIN; 4663 if (Intrinsic == Intrinsic::nvvm_ldu_global_i) 4664 Info.memVT = getValueType(DL, I.getType()); 4665 else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) 4666 Info.memVT = getPointerTy(DL); 4667 else 4668 Info.memVT = getValueType(DL, I.getType()); 4669 Info.ptrVal = I.getArgOperand(0); 4670 Info.offset = 0; 4671 Info.flags = MachineMemOperand::MOLoad; 4672 Info.align = 
cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4673 4674 return true; 4675 } 4676 case Intrinsic::nvvm_ldg_global_i: 4677 case Intrinsic::nvvm_ldg_global_f: 4678 case Intrinsic::nvvm_ldg_global_p: { 4679 auto &DL = I.getDataLayout(); 4680 4681 Info.opc = ISD::INTRINSIC_W_CHAIN; 4682 if (Intrinsic == Intrinsic::nvvm_ldg_global_i) 4683 Info.memVT = getValueType(DL, I.getType()); 4684 else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) 4685 Info.memVT = getPointerTy(DL); 4686 else 4687 Info.memVT = getValueType(DL, I.getType()); 4688 Info.ptrVal = I.getArgOperand(0); 4689 Info.offset = 0; 4690 Info.flags = MachineMemOperand::MOLoad; 4691 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); 4692 4693 return true; 4694 } 4695 4696 case Intrinsic::nvvm_tex_1d_v4f32_s32: 4697 case Intrinsic::nvvm_tex_1d_v4f32_f32: 4698 case Intrinsic::nvvm_tex_1d_level_v4f32_f32: 4699 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: 4700 case Intrinsic::nvvm_tex_1d_array_v4f32_s32: 4701 case Intrinsic::nvvm_tex_1d_array_v4f32_f32: 4702 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: 4703 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: 4704 case Intrinsic::nvvm_tex_2d_v4f32_s32: 4705 case Intrinsic::nvvm_tex_2d_v4f32_f32: 4706 case Intrinsic::nvvm_tex_2d_level_v4f32_f32: 4707 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: 4708 case Intrinsic::nvvm_tex_2d_array_v4f32_s32: 4709 case Intrinsic::nvvm_tex_2d_array_v4f32_f32: 4710 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: 4711 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: 4712 case Intrinsic::nvvm_tex_3d_v4f32_s32: 4713 case Intrinsic::nvvm_tex_3d_v4f32_f32: 4714 case Intrinsic::nvvm_tex_3d_level_v4f32_f32: 4715 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: 4716 case Intrinsic::nvvm_tex_cube_v4f32_f32: 4717 case Intrinsic::nvvm_tex_cube_level_v4f32_f32: 4718 case Intrinsic::nvvm_tex_cube_array_v4f32_f32: 4719 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: 4720 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: 4721 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: 4722 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: 4723 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: 4724 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: 4725 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: 4726 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: 4727 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: 4728 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: 4729 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: 4730 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: 4731 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: 4732 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: 4733 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: 4734 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: 4735 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: 4736 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: 4737 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: 4738 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: 4739 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: 4740 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: 4741 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: 4742 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: 4743 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: 4744 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: 4745 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: 4746 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: 4747 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: 4748 case 
Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: 4749 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: 4750 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: 4751 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: 4752 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: 4753 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: 4754 Info.opc = getOpcForTextureInstr(Intrinsic); 4755 Info.memVT = MVT::v4f32; 4756 Info.ptrVal = nullptr; 4757 Info.offset = 0; 4758 Info.flags = MachineMemOperand::MOLoad; 4759 Info.align = Align(16); 4760 return true; 4761 4762 case Intrinsic::nvvm_tex_1d_v4s32_s32: 4763 case Intrinsic::nvvm_tex_1d_v4s32_f32: 4764 case Intrinsic::nvvm_tex_1d_level_v4s32_f32: 4765 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: 4766 case Intrinsic::nvvm_tex_1d_array_v4s32_s32: 4767 case Intrinsic::nvvm_tex_1d_array_v4s32_f32: 4768 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: 4769 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: 4770 case Intrinsic::nvvm_tex_2d_v4s32_s32: 4771 case Intrinsic::nvvm_tex_2d_v4s32_f32: 4772 case Intrinsic::nvvm_tex_2d_level_v4s32_f32: 4773 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: 4774 case Intrinsic::nvvm_tex_2d_array_v4s32_s32: 4775 case Intrinsic::nvvm_tex_2d_array_v4s32_f32: 4776 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: 4777 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: 4778 case Intrinsic::nvvm_tex_3d_v4s32_s32: 4779 case Intrinsic::nvvm_tex_3d_v4s32_f32: 4780 case Intrinsic::nvvm_tex_3d_level_v4s32_f32: 4781 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: 4782 case Intrinsic::nvvm_tex_cube_v4s32_f32: 4783 case Intrinsic::nvvm_tex_cube_level_v4s32_f32: 4784 case Intrinsic::nvvm_tex_cube_array_v4s32_f32: 4785 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: 4786 case Intrinsic::nvvm_tex_cube_v4u32_f32: 4787 case Intrinsic::nvvm_tex_cube_level_v4u32_f32: 4788 case Intrinsic::nvvm_tex_cube_array_v4u32_f32: 4789 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: 4790 case Intrinsic::nvvm_tex_1d_v4u32_s32: 4791 case Intrinsic::nvvm_tex_1d_v4u32_f32: 4792 case Intrinsic::nvvm_tex_1d_level_v4u32_f32: 4793 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: 4794 case Intrinsic::nvvm_tex_1d_array_v4u32_s32: 4795 case Intrinsic::nvvm_tex_1d_array_v4u32_f32: 4796 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: 4797 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: 4798 case Intrinsic::nvvm_tex_2d_v4u32_s32: 4799 case Intrinsic::nvvm_tex_2d_v4u32_f32: 4800 case Intrinsic::nvvm_tex_2d_level_v4u32_f32: 4801 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: 4802 case Intrinsic::nvvm_tex_2d_array_v4u32_s32: 4803 case Intrinsic::nvvm_tex_2d_array_v4u32_f32: 4804 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: 4805 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: 4806 case Intrinsic::nvvm_tex_3d_v4u32_s32: 4807 case Intrinsic::nvvm_tex_3d_v4u32_f32: 4808 case Intrinsic::nvvm_tex_3d_level_v4u32_f32: 4809 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: 4810 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: 4811 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: 4812 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: 4813 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: 4814 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: 4815 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: 4816 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: 4817 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: 4818 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: 4819 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: 4820 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: 4821 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: 4822 case 
Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: 4823 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: 4824 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: 4825 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: 4826 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: 4827 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: 4828 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: 4829 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: 4830 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: 4831 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: 4832 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: 4833 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: 4834 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: 4835 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: 4836 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: 4837 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: 4838 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: 4839 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: 4840 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: 4841 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: 4842 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: 4843 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: 4844 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: 4845 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: 4846 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: 4847 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: 4848 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: 4849 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: 4850 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: 4851 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: 4852 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: 4853 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: 4854 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: 4855 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: 4856 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: 4857 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: 4858 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: 4859 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: 4860 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: 4861 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: 4862 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: 4863 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: 4864 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: 4865 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: 4866 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: 4867 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: 4868 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: 4869 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: 4870 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: 4871 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: 4872 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: 4873 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: 4874 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: 4875 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: 4876 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: 4877 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: 4878 Info.opc = getOpcForTextureInstr(Intrinsic); 4879 Info.memVT = MVT::v4i32; 4880 Info.ptrVal = nullptr; 4881 Info.offset = 0; 4882 Info.flags = MachineMemOperand::MOLoad; 4883 Info.align = Align(16); 4884 return true; 4885 4886 case Intrinsic::nvvm_suld_1d_i8_clamp: 4887 case Intrinsic::nvvm_suld_1d_v2i8_clamp: 4888 case 
Intrinsic::nvvm_suld_1d_v4i8_clamp: 4889 case Intrinsic::nvvm_suld_1d_array_i8_clamp: 4890 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: 4891 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: 4892 case Intrinsic::nvvm_suld_2d_i8_clamp: 4893 case Intrinsic::nvvm_suld_2d_v2i8_clamp: 4894 case Intrinsic::nvvm_suld_2d_v4i8_clamp: 4895 case Intrinsic::nvvm_suld_2d_array_i8_clamp: 4896 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: 4897 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: 4898 case Intrinsic::nvvm_suld_3d_i8_clamp: 4899 case Intrinsic::nvvm_suld_3d_v2i8_clamp: 4900 case Intrinsic::nvvm_suld_3d_v4i8_clamp: 4901 case Intrinsic::nvvm_suld_1d_i8_trap: 4902 case Intrinsic::nvvm_suld_1d_v2i8_trap: 4903 case Intrinsic::nvvm_suld_1d_v4i8_trap: 4904 case Intrinsic::nvvm_suld_1d_array_i8_trap: 4905 case Intrinsic::nvvm_suld_1d_array_v2i8_trap: 4906 case Intrinsic::nvvm_suld_1d_array_v4i8_trap: 4907 case Intrinsic::nvvm_suld_2d_i8_trap: 4908 case Intrinsic::nvvm_suld_2d_v2i8_trap: 4909 case Intrinsic::nvvm_suld_2d_v4i8_trap: 4910 case Intrinsic::nvvm_suld_2d_array_i8_trap: 4911 case Intrinsic::nvvm_suld_2d_array_v2i8_trap: 4912 case Intrinsic::nvvm_suld_2d_array_v4i8_trap: 4913 case Intrinsic::nvvm_suld_3d_i8_trap: 4914 case Intrinsic::nvvm_suld_3d_v2i8_trap: 4915 case Intrinsic::nvvm_suld_3d_v4i8_trap: 4916 case Intrinsic::nvvm_suld_1d_i8_zero: 4917 case Intrinsic::nvvm_suld_1d_v2i8_zero: 4918 case Intrinsic::nvvm_suld_1d_v4i8_zero: 4919 case Intrinsic::nvvm_suld_1d_array_i8_zero: 4920 case Intrinsic::nvvm_suld_1d_array_v2i8_zero: 4921 case Intrinsic::nvvm_suld_1d_array_v4i8_zero: 4922 case Intrinsic::nvvm_suld_2d_i8_zero: 4923 case Intrinsic::nvvm_suld_2d_v2i8_zero: 4924 case Intrinsic::nvvm_suld_2d_v4i8_zero: 4925 case Intrinsic::nvvm_suld_2d_array_i8_zero: 4926 case Intrinsic::nvvm_suld_2d_array_v2i8_zero: 4927 case Intrinsic::nvvm_suld_2d_array_v4i8_zero: 4928 case Intrinsic::nvvm_suld_3d_i8_zero: 4929 case Intrinsic::nvvm_suld_3d_v2i8_zero: 4930 case Intrinsic::nvvm_suld_3d_v4i8_zero: 4931 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4932 Info.memVT = MVT::i8; 4933 Info.ptrVal = nullptr; 4934 Info.offset = 0; 4935 Info.flags = MachineMemOperand::MOLoad; 4936 Info.align = Align(16); 4937 return true; 4938 4939 case Intrinsic::nvvm_suld_1d_i16_clamp: 4940 case Intrinsic::nvvm_suld_1d_v2i16_clamp: 4941 case Intrinsic::nvvm_suld_1d_v4i16_clamp: 4942 case Intrinsic::nvvm_suld_1d_array_i16_clamp: 4943 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: 4944 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: 4945 case Intrinsic::nvvm_suld_2d_i16_clamp: 4946 case Intrinsic::nvvm_suld_2d_v2i16_clamp: 4947 case Intrinsic::nvvm_suld_2d_v4i16_clamp: 4948 case Intrinsic::nvvm_suld_2d_array_i16_clamp: 4949 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: 4950 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: 4951 case Intrinsic::nvvm_suld_3d_i16_clamp: 4952 case Intrinsic::nvvm_suld_3d_v2i16_clamp: 4953 case Intrinsic::nvvm_suld_3d_v4i16_clamp: 4954 case Intrinsic::nvvm_suld_1d_i16_trap: 4955 case Intrinsic::nvvm_suld_1d_v2i16_trap: 4956 case Intrinsic::nvvm_suld_1d_v4i16_trap: 4957 case Intrinsic::nvvm_suld_1d_array_i16_trap: 4958 case Intrinsic::nvvm_suld_1d_array_v2i16_trap: 4959 case Intrinsic::nvvm_suld_1d_array_v4i16_trap: 4960 case Intrinsic::nvvm_suld_2d_i16_trap: 4961 case Intrinsic::nvvm_suld_2d_v2i16_trap: 4962 case Intrinsic::nvvm_suld_2d_v4i16_trap: 4963 case Intrinsic::nvvm_suld_2d_array_i16_trap: 4964 case Intrinsic::nvvm_suld_2d_array_v2i16_trap: 4965 case Intrinsic::nvvm_suld_2d_array_v4i16_trap: 
4966 case Intrinsic::nvvm_suld_3d_i16_trap: 4967 case Intrinsic::nvvm_suld_3d_v2i16_trap: 4968 case Intrinsic::nvvm_suld_3d_v4i16_trap: 4969 case Intrinsic::nvvm_suld_1d_i16_zero: 4970 case Intrinsic::nvvm_suld_1d_v2i16_zero: 4971 case Intrinsic::nvvm_suld_1d_v4i16_zero: 4972 case Intrinsic::nvvm_suld_1d_array_i16_zero: 4973 case Intrinsic::nvvm_suld_1d_array_v2i16_zero: 4974 case Intrinsic::nvvm_suld_1d_array_v4i16_zero: 4975 case Intrinsic::nvvm_suld_2d_i16_zero: 4976 case Intrinsic::nvvm_suld_2d_v2i16_zero: 4977 case Intrinsic::nvvm_suld_2d_v4i16_zero: 4978 case Intrinsic::nvvm_suld_2d_array_i16_zero: 4979 case Intrinsic::nvvm_suld_2d_array_v2i16_zero: 4980 case Intrinsic::nvvm_suld_2d_array_v4i16_zero: 4981 case Intrinsic::nvvm_suld_3d_i16_zero: 4982 case Intrinsic::nvvm_suld_3d_v2i16_zero: 4983 case Intrinsic::nvvm_suld_3d_v4i16_zero: 4984 Info.opc = getOpcForSurfaceInstr(Intrinsic); 4985 Info.memVT = MVT::i16; 4986 Info.ptrVal = nullptr; 4987 Info.offset = 0; 4988 Info.flags = MachineMemOperand::MOLoad; 4989 Info.align = Align(16); 4990 return true; 4991 4992 case Intrinsic::nvvm_suld_1d_i32_clamp: 4993 case Intrinsic::nvvm_suld_1d_v2i32_clamp: 4994 case Intrinsic::nvvm_suld_1d_v4i32_clamp: 4995 case Intrinsic::nvvm_suld_1d_array_i32_clamp: 4996 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: 4997 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: 4998 case Intrinsic::nvvm_suld_2d_i32_clamp: 4999 case Intrinsic::nvvm_suld_2d_v2i32_clamp: 5000 case Intrinsic::nvvm_suld_2d_v4i32_clamp: 5001 case Intrinsic::nvvm_suld_2d_array_i32_clamp: 5002 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: 5003 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: 5004 case Intrinsic::nvvm_suld_3d_i32_clamp: 5005 case Intrinsic::nvvm_suld_3d_v2i32_clamp: 5006 case Intrinsic::nvvm_suld_3d_v4i32_clamp: 5007 case Intrinsic::nvvm_suld_1d_i32_trap: 5008 case Intrinsic::nvvm_suld_1d_v2i32_trap: 5009 case Intrinsic::nvvm_suld_1d_v4i32_trap: 5010 case Intrinsic::nvvm_suld_1d_array_i32_trap: 5011 case Intrinsic::nvvm_suld_1d_array_v2i32_trap: 5012 case Intrinsic::nvvm_suld_1d_array_v4i32_trap: 5013 case Intrinsic::nvvm_suld_2d_i32_trap: 5014 case Intrinsic::nvvm_suld_2d_v2i32_trap: 5015 case Intrinsic::nvvm_suld_2d_v4i32_trap: 5016 case Intrinsic::nvvm_suld_2d_array_i32_trap: 5017 case Intrinsic::nvvm_suld_2d_array_v2i32_trap: 5018 case Intrinsic::nvvm_suld_2d_array_v4i32_trap: 5019 case Intrinsic::nvvm_suld_3d_i32_trap: 5020 case Intrinsic::nvvm_suld_3d_v2i32_trap: 5021 case Intrinsic::nvvm_suld_3d_v4i32_trap: 5022 case Intrinsic::nvvm_suld_1d_i32_zero: 5023 case Intrinsic::nvvm_suld_1d_v2i32_zero: 5024 case Intrinsic::nvvm_suld_1d_v4i32_zero: 5025 case Intrinsic::nvvm_suld_1d_array_i32_zero: 5026 case Intrinsic::nvvm_suld_1d_array_v2i32_zero: 5027 case Intrinsic::nvvm_suld_1d_array_v4i32_zero: 5028 case Intrinsic::nvvm_suld_2d_i32_zero: 5029 case Intrinsic::nvvm_suld_2d_v2i32_zero: 5030 case Intrinsic::nvvm_suld_2d_v4i32_zero: 5031 case Intrinsic::nvvm_suld_2d_array_i32_zero: 5032 case Intrinsic::nvvm_suld_2d_array_v2i32_zero: 5033 case Intrinsic::nvvm_suld_2d_array_v4i32_zero: 5034 case Intrinsic::nvvm_suld_3d_i32_zero: 5035 case Intrinsic::nvvm_suld_3d_v2i32_zero: 5036 case Intrinsic::nvvm_suld_3d_v4i32_zero: 5037 Info.opc = getOpcForSurfaceInstr(Intrinsic); 5038 Info.memVT = MVT::i32; 5039 Info.ptrVal = nullptr; 5040 Info.offset = 0; 5041 Info.flags = MachineMemOperand::MOLoad; 5042 Info.align = Align(16); 5043 return true; 5044 5045 case Intrinsic::nvvm_suld_1d_i64_clamp: 5046 case 
Intrinsic::nvvm_suld_1d_v2i64_clamp: 5047 case Intrinsic::nvvm_suld_1d_array_i64_clamp: 5048 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: 5049 case Intrinsic::nvvm_suld_2d_i64_clamp: 5050 case Intrinsic::nvvm_suld_2d_v2i64_clamp: 5051 case Intrinsic::nvvm_suld_2d_array_i64_clamp: 5052 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: 5053 case Intrinsic::nvvm_suld_3d_i64_clamp: 5054 case Intrinsic::nvvm_suld_3d_v2i64_clamp: 5055 case Intrinsic::nvvm_suld_1d_i64_trap: 5056 case Intrinsic::nvvm_suld_1d_v2i64_trap: 5057 case Intrinsic::nvvm_suld_1d_array_i64_trap: 5058 case Intrinsic::nvvm_suld_1d_array_v2i64_trap: 5059 case Intrinsic::nvvm_suld_2d_i64_trap: 5060 case Intrinsic::nvvm_suld_2d_v2i64_trap: 5061 case Intrinsic::nvvm_suld_2d_array_i64_trap: 5062 case Intrinsic::nvvm_suld_2d_array_v2i64_trap: 5063 case Intrinsic::nvvm_suld_3d_i64_trap: 5064 case Intrinsic::nvvm_suld_3d_v2i64_trap: 5065 case Intrinsic::nvvm_suld_1d_i64_zero: 5066 case Intrinsic::nvvm_suld_1d_v2i64_zero: 5067 case Intrinsic::nvvm_suld_1d_array_i64_zero: 5068 case Intrinsic::nvvm_suld_1d_array_v2i64_zero: 5069 case Intrinsic::nvvm_suld_2d_i64_zero: 5070 case Intrinsic::nvvm_suld_2d_v2i64_zero: 5071 case Intrinsic::nvvm_suld_2d_array_i64_zero: 5072 case Intrinsic::nvvm_suld_2d_array_v2i64_zero: 5073 case Intrinsic::nvvm_suld_3d_i64_zero: 5074 case Intrinsic::nvvm_suld_3d_v2i64_zero: 5075 Info.opc = getOpcForSurfaceInstr(Intrinsic); 5076 Info.memVT = MVT::i64; 5077 Info.ptrVal = nullptr; 5078 Info.offset = 0; 5079 Info.flags = MachineMemOperand::MOLoad; 5080 Info.align = Align(16); 5081 return true; 5082 } 5083 return false; 5084 } 5085 5086 /// getFunctionParamOptimizedAlign - since function arguments are passed via 5087 /// .param space, we may want to increase their alignment in a way that 5088 /// ensures that we can effectively vectorize their loads & stores. We can 5089 /// increase alignment only if the function has internal or has private 5090 /// linkage as for other linkage types callers may already rely on default 5091 /// alignment. To allow using 128-bit vectorized loads/stores, this function 5092 /// ensures that alignment is 16 or greater. 5093 Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( 5094 const Function *F, Type *ArgTy, const DataLayout &DL) const { 5095 // Capping the alignment to 128 bytes as that is the maximum alignment 5096 // supported by PTX. 5097 const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy)); 5098 5099 // If a function has linkage different from internal or private, we 5100 // must use default ABI alignment as external users rely on it. Same 5101 // for a function that may be called from a function pointer. 5102 if (!F || !F->hasLocalLinkage() || 5103 F->hasAddressTaken(/*Users=*/nullptr, 5104 /*IgnoreCallbackUses=*/false, 5105 /*IgnoreAssumeLikeCalls=*/true, 5106 /*IgnoreLLVMUsed=*/true)) 5107 return ABITypeAlign; 5108 5109 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); 5110 return std::max(Align(16), ABITypeAlign); 5111 } 5112 5113 /// Helper for computing alignment of a device function byval parameter. 5114 Align NVPTXTargetLowering::getFunctionByValParamAlign( 5115 const Function *F, Type *ArgTy, Align InitialAlign, 5116 const DataLayout &DL) const { 5117 Align ArgAlign = InitialAlign; 5118 // Try to increase alignment to enhance vectorization options. 5119 if (F) 5120 ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL)); 5121 5122 // Old ptx versions have a bug. 
When PTX code takes address of 5123 // byval parameter with alignment < 4, ptxas generates code to 5124 // spill argument into memory. Alas on sm_50+ ptxas generates 5125 // SASS code that fails with misaligned access. To work around 5126 // the problem, make sure that we align byval parameters by at 5127 // least 4. This bug seems to be fixed at least starting from 5128 // ptxas > 9.0. 5129 // TODO: remove this after verifying the bug is not reproduced 5130 // on non-deprecated ptxas versions. 5131 if (ForceMinByValParamAlign) 5132 ArgAlign = std::max(ArgAlign, Align(4)); 5133 5134 return ArgAlign; 5135 } 5136 5137 // Helper for getting a function parameter name. Name is composed from 5138 // its index and the function name. Negative index corresponds to special 5139 // parameter (unsized array) used for passing variable arguments. 5140 std::string NVPTXTargetLowering::getParamName(const Function *F, 5141 int Idx) const { 5142 std::string ParamName; 5143 raw_string_ostream ParamStr(ParamName); 5144 5145 ParamStr << getTargetMachine().getSymbol(F)->getName(); 5146 if (Idx < 0) 5147 ParamStr << "_vararg"; 5148 else 5149 ParamStr << "_param_" << Idx; 5150 5151 return ParamName; 5152 } 5153 5154 /// isLegalAddressingMode - Return true if the addressing mode represented 5155 /// by AM is legal for this target, for a load/store of the specified type. 5156 /// Used to guide target specific optimizations, like loop strength reduction 5157 /// (LoopStrengthReduce.cpp) and memory optimization for address mode 5158 /// (CodeGenPrepare.cpp) 5159 bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, 5160 const AddrMode &AM, Type *Ty, 5161 unsigned AS, Instruction *I) const { 5162 // AddrMode - This represents an addressing mode of: 5163 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg 5164 // 5165 // The legal address modes are 5166 // - [avar] 5167 // - [areg] 5168 // - [areg+immoff] 5169 // - [immAddr] 5170 5171 // immoff must fit in a signed 32-bit int 5172 if (!APInt(64, AM.BaseOffs).isSignedIntN(32)) 5173 return false; 5174 5175 if (AM.BaseGV) 5176 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; 5177 5178 switch (AM.Scale) { 5179 case 0: // "r", "r+i" or "i" is allowed 5180 break; 5181 case 1: 5182 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. 5183 return false; 5184 // Otherwise we have r+i. 5185 break; 5186 default: 5187 // No scale > 1 is allowed 5188 return false; 5189 } 5190 return true; 5191 } 5192 5193 //===----------------------------------------------------------------------===// 5194 // NVPTX Inline Assembly Support 5195 //===----------------------------------------------------------------------===// 5196 5197 /// getConstraintType - Given a constraint letter, return the type of 5198 /// constraint it is for this target. 
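///
/// For illustration only (an assumed usage example, not taken from this file):
/// in an inline-asm statement such as
///   asm("mov.u32 %0, %1;" : "=r"(Dst) : "r"(Src));
/// the single-letter 'r' constraint is classified as C_RegisterClass here and
/// is later mapped to the 32-bit integer register class by
/// getRegForInlineAsmConstraint below.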
5199 NVPTXTargetLowering::ConstraintType 5200 NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { 5201 if (Constraint.size() == 1) { 5202 switch (Constraint[0]) { 5203 default: 5204 break; 5205 case 'b': 5206 case 'r': 5207 case 'h': 5208 case 'c': 5209 case 'l': 5210 case 'f': 5211 case 'd': 5212 case 'q': 5213 case '0': 5214 case 'N': 5215 return C_RegisterClass; 5216 } 5217 } 5218 return TargetLowering::getConstraintType(Constraint); 5219 } 5220 5221 std::pair<unsigned, const TargetRegisterClass *> 5222 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, 5223 StringRef Constraint, 5224 MVT VT) const { 5225 if (Constraint.size() == 1) { 5226 switch (Constraint[0]) { 5227 case 'b': 5228 return std::make_pair(0U, &NVPTX::Int1RegsRegClass); 5229 case 'c': 5230 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 5231 case 'h': 5232 return std::make_pair(0U, &NVPTX::Int16RegsRegClass); 5233 case 'r': 5234 return std::make_pair(0U, &NVPTX::Int32RegsRegClass); 5235 case 'l': 5236 case 'N': 5237 return std::make_pair(0U, &NVPTX::Int64RegsRegClass); 5238 case 'q': { 5239 if (STI.getSmVersion() < 70) 5240 report_fatal_error("Inline asm with 128 bit operands is only " 5241 "supported for sm_70 and higher!"); 5242 return std::make_pair(0U, &NVPTX::Int128RegsRegClass); 5243 } 5244 case 'f': 5245 return std::make_pair(0U, &NVPTX::Float32RegsRegClass); 5246 case 'd': 5247 return std::make_pair(0U, &NVPTX::Float64RegsRegClass); 5248 } 5249 } 5250 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 5251 } 5252 5253 //===----------------------------------------------------------------------===// 5254 // NVPTX DAG Combining 5255 //===----------------------------------------------------------------------===// 5256 5257 bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, 5258 CodeGenOptLevel OptLevel) const { 5259 // Always honor command-line argument 5260 if (FMAContractLevelOpt.getNumOccurrences() > 0) 5261 return FMAContractLevelOpt > 0; 5262 5263 // Do not contract if we're not optimizing the code. 5264 if (OptLevel == CodeGenOptLevel::None) 5265 return false; 5266 5267 // Honor TargetOptions flags that explicitly say fusion is okay. 5268 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) 5269 return true; 5270 5271 return allowUnsafeFPMath(MF); 5272 } 5273 5274 bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { 5275 // Honor TargetOptions flags that explicitly say unsafe math is okay. 5276 if (MF.getTarget().Options.UnsafeFPMath) 5277 return true; 5278 5279 // Allow unsafe math if unsafe-fp-math attribute explicitly says so. 5280 const Function &F = MF.getFunction(); 5281 return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); 5282 } 5283 5284 static bool isConstZero(const SDValue &Operand) { 5285 const auto *Const = dyn_cast<ConstantSDNode>(Operand); 5286 return Const && Const->getZExtValue() == 0; 5287 } 5288 5289 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 5290 /// operands N0 and N1. This is a helper for PerformADDCombine that is 5291 /// called with the default operands, and if that fails, with commuted 5292 /// operands. 
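///
/// A sketch of the intended folds (illustrative only; the code below states
/// the exact conditions):
///   (add (mul a, b), c)                   -> (mad a, b, c)
///   (add (select cond, 0, (mul a, b)), c) -> (select cond, c, (mad a, b, c))
/// Both folds require the mul to have a single use, because an integer mad
/// costs about as much as a mul but more than an add.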
static SDValue
PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N0.getValueType();

  // Since integer multiply-add costs the same as integer multiply
  // but is more costly than integer add, do the fusion only when
  // the mul is only used in the add.
  // TODO: this may not be true for later architectures, consider relaxing this
  if (!N0.getNode()->hasOneUse())
    return SDValue();

  // fold (add (mul a, b), c) -> (mad a, b, c)
  //
  if (N0.getOpcode() == ISD::MUL)
    return DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT, N0.getOperand(0),
                           N0.getOperand(1), N1);

  // fold (add (select cond, 0, (mul a, b)), c)
  //   -> (select cond, c, (mad a, b, c))
  //
  if (N0.getOpcode() == ISD::SELECT) {
    unsigned ZeroOpNum;
    if (isConstZero(N0->getOperand(1)))
      ZeroOpNum = 1;
    else if (isConstZero(N0->getOperand(2)))
      ZeroOpNum = 2;
    else
      return SDValue();

    SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
    if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
      return SDValue();

    SDValue MAD = DCI.DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
                                  M->getOperand(0), M->getOperand(1), N1);
    return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
                             ((ZeroOpNum == 1) ? N1 : MAD),
                             ((ZeroOpNum == 1) ? MAD : N1));
  }

  return SDValue();
}

static SDValue
PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               CodeGenOptLevel OptLevel) {
  EVT VT = N0.getValueType();
  if (N0.getOpcode() == ISD::FMUL) {
    const auto *TLI = static_cast<const NVPTXTargetLowering *>(
        &DCI.DAG.getTargetLoweringInfo());
    if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
      return SDValue();

    // For floating point:
    // Do the fusion only when the mul has fewer than 5 uses and all of them
    // are adds.
    // The heuristic is that if a use is not an add, then that use cannot be
    // fused into an fma, so the mul is still needed anyway.
    // If there are more than 4 uses, even if they are all adds, fusing them
    // will increase register pressure.
    //
    int numUses = 0;
    int nonAddCount = 0;
    for (const SDNode *User : N0.getNode()->uses()) {
      numUses++;
      if (User->getOpcode() != ISD::FADD)
        ++nonAddCount;
      if (numUses >= 5)
        return SDValue();
    }
    if (nonAddCount) {
      int orderNo = N->getIROrder();
      int orderNo2 = N0.getNode()->getIROrder();
      // Simple heuristic for estimating potential register pressure: the
      // difference in IR order approximates the distance between the def and
      // this use, and a longer distance is more likely to cause register
      // pressure.
      if (orderNo - orderNo2 < 500)
        return SDValue();

      // Now, check if at least one of the FMUL's operands is live beyond the
      // node N, which guarantees that the FMA will not increase register
      // pressure at node N.
      bool opIsLive = false;
      const SDNode *left = N0.getOperand(0).getNode();
      const SDNode *right = N0.getOperand(1).getNode();

      if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
        opIsLive = true;

      if (!opIsLive)
        for (const SDNode *User : left->uses()) {
          int orderNo3 = User->getIROrder();
          if (orderNo3 > orderNo) {
            opIsLive = true;
            break;
          }
        }

      if (!opIsLive)
        for (const SDNode *User : right->uses()) {
          int orderNo3 = User->getIROrder();
          if (orderNo3 > orderNo) {
            opIsLive = true;
            break;
          }
        }

      if (!opIsLive)
        return SDValue();
    }

    return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
                           N0.getOperand(1), N1);
  }

  return SDValue();
}

static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
                                         std::size_t Back) {
  if (all_of(N->ops().drop_front(Front).drop_back(Back),
             [](const SDUse &U) { return U.get()->isUndef(); }))
    // Operand 0 is the previous value in the chain. Cannot return EntryToken
    // as the previous value will become unused and eliminated later.
    return N->getOperand(0);

  return SDValue();
}

static SDValue PerformStoreParamCombine(SDNode *N) {
  // Operands from the 3rd to the 2nd last one are the values to be stored.
  //   {Chain, ArgID, Offset, Val, Glue}
  return PerformStoreCombineHelper(N, 3, 1);
}

static SDValue PerformStoreRetvalCombine(SDNode *N) {
  // Operands from the 2nd to the last one are the values to be stored.
  return PerformStoreCombineHelper(N, 2, 0);
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Skip non-integer, non-scalar case
  EVT VT = N0.getValueType();
  if (VT.isVector() || VT != MVT::i32)
    return SDValue();

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI);
}

/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
///
static SDValue PerformFADDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  CodeGenOptLevel OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  EVT VT = N0.getValueType();
  if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
    return SDValue();

  // First try with the default operand order.
  if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
}

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if target type is integer),
  // and ANDs off the high 8 bits.
Since we turn this load into a 5485 // target-specific DAG node, the DAG combiner fails to eliminate these AND 5486 // nodes. Do that here. 5487 SDValue Val = N->getOperand(0); 5488 SDValue Mask = N->getOperand(1); 5489 5490 if (isa<ConstantSDNode>(Val)) { 5491 std::swap(Val, Mask); 5492 } 5493 5494 SDValue AExt; 5495 5496 // Convert BFE-> truncate i16 -> and 255 5497 // To just BFE-> truncate i16, as the value already has all the bits in the 5498 // right places. 5499 if (Val.getOpcode() == ISD::TRUNCATE) { 5500 SDValue BFE = Val.getOperand(0); 5501 if (BFE.getOpcode() != NVPTXISD::BFE) 5502 return SDValue(); 5503 5504 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0)); 5505 if (!BFEBits) 5506 return SDValue(); 5507 uint64_t BFEBitsVal = BFEBits->getZExtValue(); 5508 5509 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 5510 if (!MaskCnst) { 5511 // Not an AND with a constant 5512 return SDValue(); 5513 } 5514 uint64_t MaskVal = MaskCnst->getZExtValue(); 5515 5516 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1) 5517 return SDValue(); 5518 // If we get here, the AND is unnecessary. Just replace it with the trunc 5519 DCI.CombineTo(N, Val, false); 5520 } 5521 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and 5522 if (Val.getOpcode() == ISD::ANY_EXTEND) { 5523 AExt = Val; 5524 Val = Val->getOperand(0); 5525 } 5526 5527 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { 5528 Val = Val->getOperand(0); 5529 } 5530 5531 if (Val->getOpcode() == NVPTXISD::LoadV2 || 5532 Val->getOpcode() == NVPTXISD::LoadV4) { 5533 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); 5534 if (!MaskCnst) { 5535 // Not an AND with a constant 5536 return SDValue(); 5537 } 5538 5539 uint64_t MaskVal = MaskCnst->getZExtValue(); 5540 if (MaskVal != 0xff) { 5541 // Not an AND that chops off top 8 bits 5542 return SDValue(); 5543 } 5544 5545 MemSDNode *Mem = dyn_cast<MemSDNode>(Val); 5546 if (!Mem) { 5547 // Not a MemSDNode?!? 5548 return SDValue(); 5549 } 5550 5551 EVT MemVT = Mem->getMemoryVT(); 5552 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { 5553 // We only handle the i8 case 5554 return SDValue(); 5555 } 5556 5557 unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1); 5558 if (ExtType == ISD::SEXTLOAD) { 5559 // If for some reason the load is a sextload, the and is needed to zero 5560 // out the high 8 bits 5561 return SDValue(); 5562 } 5563 5564 bool AddTo = false; 5565 if (AExt.getNode() != nullptr) { 5566 // Re-insert the ext as a zext. 5567 Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), 5568 AExt.getValueType(), Val); 5569 AddTo = true; 5570 } 5571 5572 // If we get here, the AND is unnecessary. Just replace it with the load 5573 DCI.CombineTo(N, Val, AddTo); 5574 } 5575 5576 return SDValue(); 5577 } 5578 5579 static SDValue PerformREMCombine(SDNode *N, 5580 TargetLowering::DAGCombinerInfo &DCI, 5581 CodeGenOptLevel OptLevel) { 5582 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); 5583 5584 // Don't do anything at less than -O2. 5585 if (OptLevel < CodeGenOptLevel::Default) 5586 return SDValue(); 5587 5588 SelectionDAG &DAG = DCI.DAG; 5589 SDLoc DL(N); 5590 EVT VT = N->getValueType(0); 5591 bool IsSigned = N->getOpcode() == ISD::SREM; 5592 unsigned DivOpc = IsSigned ? 
ISD::SDIV : ISD::UDIV; 5593 5594 const SDValue &Num = N->getOperand(0); 5595 const SDValue &Den = N->getOperand(1); 5596 5597 for (const SDNode *U : Num->uses()) { 5598 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && 5599 U->getOperand(1) == Den) { 5600 // Num % Den -> Num - (Num / Den) * Den 5601 return DAG.getNode(ISD::SUB, DL, VT, Num, 5602 DAG.getNode(ISD::MUL, DL, VT, 5603 DAG.getNode(DivOpc, DL, VT, Num, Den), 5604 Den)); 5605 } 5606 } 5607 return SDValue(); 5608 } 5609 5610 enum OperandSignedness { 5611 Signed = 0, 5612 Unsigned, 5613 Unknown 5614 }; 5615 5616 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand 5617 /// that can be demoted to \p OptSize bits without loss of information. The 5618 /// signedness of the operand, if determinable, is placed in \p S. 5619 static bool IsMulWideOperandDemotable(SDValue Op, 5620 unsigned OptSize, 5621 OperandSignedness &S) { 5622 S = Unknown; 5623 5624 if (Op.getOpcode() == ISD::SIGN_EXTEND || 5625 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { 5626 EVT OrigVT = Op.getOperand(0).getValueType(); 5627 if (OrigVT.getFixedSizeInBits() <= OptSize) { 5628 S = Signed; 5629 return true; 5630 } 5631 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { 5632 EVT OrigVT = Op.getOperand(0).getValueType(); 5633 if (OrigVT.getFixedSizeInBits() <= OptSize) { 5634 S = Unsigned; 5635 return true; 5636 } 5637 } 5638 5639 return false; 5640 } 5641 5642 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can 5643 /// be demoted to \p OptSize bits without loss of information. If the operands 5644 /// contain a constant, it should appear as the RHS operand. The signedness of 5645 /// the operands is placed in \p IsSigned. 5646 static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, 5647 unsigned OptSize, 5648 bool &IsSigned) { 5649 OperandSignedness LHSSign; 5650 5651 // The LHS operand must be a demotable op 5652 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign)) 5653 return false; 5654 5655 // We should have been able to determine the signedness from the LHS 5656 if (LHSSign == Unknown) 5657 return false; 5658 5659 IsSigned = (LHSSign == Signed); 5660 5661 // The RHS can be a demotable op or a constant 5662 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { 5663 const APInt &Val = CI->getAPIntValue(); 5664 if (LHSSign == Unsigned) { 5665 return Val.isIntN(OptSize); 5666 } else { 5667 return Val.isSignedIntN(OptSize); 5668 } 5669 } else { 5670 OperandSignedness RHSSign; 5671 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) 5672 return false; 5673 5674 return LHSSign == RHSSign; 5675 } 5676 } 5677 5678 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply 5679 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform 5680 /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift 5681 /// amount. 
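///
/// Illustrative example (a sketch of the intent, not an exhaustive list of
/// cases): a 32-bit mul whose operands are both sign-extended from 16 bits or
/// fewer can be selected as mul.wide.s16, and a shl by a constant is first
/// rewritten as a multiply by the corresponding power of two.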
5682 static SDValue TryMULWIDECombine(SDNode *N, 5683 TargetLowering::DAGCombinerInfo &DCI) { 5684 EVT MulType = N->getValueType(0); 5685 if (MulType != MVT::i32 && MulType != MVT::i64) { 5686 return SDValue(); 5687 } 5688 5689 SDLoc DL(N); 5690 unsigned OptSize = MulType.getSizeInBits() >> 1; 5691 SDValue LHS = N->getOperand(0); 5692 SDValue RHS = N->getOperand(1); 5693 5694 // Canonicalize the multiply so the constant (if any) is on the right 5695 if (N->getOpcode() == ISD::MUL) { 5696 if (isa<ConstantSDNode>(LHS)) { 5697 std::swap(LHS, RHS); 5698 } 5699 } 5700 5701 // If we have a SHL, determine the actual multiply amount 5702 if (N->getOpcode() == ISD::SHL) { 5703 ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS); 5704 if (!ShlRHS) { 5705 return SDValue(); 5706 } 5707 5708 APInt ShiftAmt = ShlRHS->getAPIntValue(); 5709 unsigned BitWidth = MulType.getSizeInBits(); 5710 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) { 5711 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; 5712 RHS = DCI.DAG.getConstant(MulVal, DL, MulType); 5713 } else { 5714 return SDValue(); 5715 } 5716 } 5717 5718 bool Signed; 5719 // Verify that our operands are demotable 5720 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) { 5721 return SDValue(); 5722 } 5723 5724 EVT DemotedVT; 5725 if (MulType == MVT::i32) { 5726 DemotedVT = MVT::i16; 5727 } else { 5728 DemotedVT = MVT::i32; 5729 } 5730 5731 // Truncate the operands to the correct size. Note that these are just for 5732 // type consistency and will (likely) be eliminated in later phases. 5733 SDValue TruncLHS = 5734 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS); 5735 SDValue TruncRHS = 5736 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS); 5737 5738 unsigned Opc; 5739 if (Signed) { 5740 Opc = NVPTXISD::MUL_WIDE_SIGNED; 5741 } else { 5742 Opc = NVPTXISD::MUL_WIDE_UNSIGNED; 5743 } 5744 5745 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS); 5746 } 5747 5748 static bool isConstOne(const SDValue &Operand) { 5749 const auto *Const = dyn_cast<ConstantSDNode>(Operand); 5750 return Const && Const->getZExtValue() == 1; 5751 } 5752 5753 static SDValue matchMADConstOnePattern(SDValue Add) { 5754 if (Add->getOpcode() != ISD::ADD) 5755 return SDValue(); 5756 5757 if (isConstOne(Add->getOperand(0))) 5758 return Add->getOperand(1); 5759 5760 if (isConstOne(Add->getOperand(1))) 5761 return Add->getOperand(0); 5762 5763 return SDValue(); 5764 } 5765 5766 static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, 5767 TargetLowering::DAGCombinerInfo &DCI) { 5768 5769 if (SDValue Y = matchMADConstOnePattern(Add)) 5770 return DCI.DAG.getNode(NVPTXISD::IMAD, DL, VT, X, Y, X); 5771 5772 return SDValue(); 5773 } 5774 5775 static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, 5776 SDLoc DL, 5777 TargetLowering::DAGCombinerInfo &DCI) { 5778 if (Select->getOpcode() != ISD::SELECT) 5779 return SDValue(); 5780 5781 SDValue Cond = Select->getOperand(0); 5782 5783 unsigned ConstOpNo; 5784 if (isConstOne(Select->getOperand(1))) 5785 ConstOpNo = 1; 5786 else if (isConstOne(Select->getOperand(2))) 5787 ConstOpNo = 2; 5788 else 5789 return SDValue(); 5790 5791 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1); 5792 5793 // Do not combine if the resulting sequence is not obviously profitable. 5794 if (!matchMADConstOnePattern(Y)) 5795 return SDValue(); 5796 5797 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y); 5798 5799 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond, 5800 (ConstOpNo == 1) ? 
                         (ConstOpNo == 1) ? X : NewMul,
                         (ConstOpNo == 1) ? NewMul : X);
}

static SDValue
PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {

  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDLoc DL(N);

  // (mul x, (add y, 1)) -> (mad x, y, x)
  if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
    return Res;
  if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
    return Res;

  // (mul x, (select y, 1)) -> (select (mul x, y), x)
  if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
    return Res;
  if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
    return Res;

  return SDValue();
}

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  if (SDValue Ret = TryMULWIDECombine(N, DCI))
    return Ret;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  return PerformMULCombineWithOperands(N, N0, N1, DCI);
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel > CodeGenOptLevel::None) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   unsigned int SmVersion) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);

  EVT AType = A.getValueType();
  if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
    return SDValue();

  if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
  SDValue CCNode = DCI.DAG.getNode(
      A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
                                     : NVPTXISD::SETP_BF16X2,
      DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

static SDValue PerformEXTRACTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Vector = N->getOperand(0);
  SDLoc DL(N);
  EVT VectorVT = Vector.getValueType();
  if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
      IsPTXVectorType(VectorVT.getSimpleVT()))
    return SDValue(); // Native vector loads already combine nicely w/
                      // extract_vector_elt.
  // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
  // handle them OK.
  if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
      VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
    return SDValue();

  // Don't mess with undef values as sra may be simplified to 0, not undef.
  if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
    return SDValue();

  uint64_t VectorBits = VectorVT.getSizeInBits();
  // We only handle the types we can extract in-register.
  if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
    return SDValue();

  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
  // Index == 0 is handled by generic DAG combiner.
  if (!Index || Index->getZExtValue() == 0)
    return SDValue();

  MVT IVT = MVT::getIntegerVT(VectorBits);
  EVT EltVT = VectorVT.getVectorElementType();
  EVT EltIVT = EltVT.changeTypeToInteger();
  uint64_t EltBits = EltVT.getScalarSizeInBits();

  SDValue Result = DCI.DAG.getNode(
      ISD::TRUNCATE, DL, EltIVT,
      DCI.DAG.getNode(
          ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
          DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));

  // If the element has a non-integer type, bitcast it back to the expected
  // type.
  if (EltVT != EltIVT)
    Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
  // Past the legalizer, we may need to extend i8 -> i16 to match the register
  // type.
  if (EltVT != N->getValueType(0))
    Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);

  return Result;
}

static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue VA = N->getOperand(1);
  EVT VectorVT = VA.getValueType();
  if (VectorVT != MVT::v4i8)
    return SDValue();

  // We need to split the vselect into individual per-element operations.
  // Because we use BFE/BFI instructions for byte extraction/insertion, we end
  // up with 32-bit values, so we may as well do the comparison as i32 to avoid
  // the conversions to/from i16 normally used for i8 values.
  SmallVector<SDValue, 4> E;
  SDLoc DL(N);
  SDValue VCond = N->getOperand(0);
  SDValue VB = N->getOperand(2);
  for (int I = 0; I < 4; ++I) {
    SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
                                DCI.DAG.getConstant(I, DL, MVT::i32));
    SDValue EA = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    SDValue EB = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    E.push_back(DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
  }
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
}

static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  LoadSDNode *LD = cast<LoadSDNode>(N);

  // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
  // letting ReplaceLoadVector split it into smaller loads during legalization.
  // This is done at dag-combine1 time, so that vector operations with i8
  // elements can be optimised away instead of being needlessly split during
  // legalization, which involves storing to the stack and loading it back.
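  //
  // Roughly, the rewrite performed below is:
  //   (v16i8 (load ptr)) --> (bitcast (BUILD_VECTOR v4i32 r0, r1, r2, r3))
  // where r0..r3 (plus a new chain) come from an NVPTXISD::LoadV4 with i32
  // results that reuses the original load's memory operand.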
  EVT VT = N->getValueType(0);
  if (VT != MVT::v16i8)
    return SDValue();

  SDLoc DL(N);

  // Create a v4i32 vector load operation, effectively <4 x v4i8>.
  unsigned Opc = NVPTXISD::LoadV4;
  EVT NewVT = MVT::v4i32;
  EVT EltVT = NewVT.getVectorElementType();
  unsigned NumElts = NewVT.getVectorNumElements();
  EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
  SDVTList RetVTList = DAG.getVTList(RetVTs);
  SmallVector<SDValue, 8> Ops(N->ops());
  Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
  SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
                                            LD->getMemOperand());
  SDValue NewChain = NewLoad.getValue(NumElts);

  // Create a vector of the same type returned by the original load.
  SmallVector<SDValue, 4> Elts;
  for (unsigned i = 0; i < NumElts; i++)
    Elts.push_back(NewLoad.getValue(i));
  return DCI.DAG.getMergeValues(
      {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
       NewChain},
      DL);
}

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::ADD:
    return PerformADDCombine(N, DCI, OptLevel);
  case ISD::FADD:
    return PerformFADDCombine(N, DCI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI, STI.getSmVersion());
  case ISD::LOAD:
    return PerformLOADCombine(N, DCI);
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    return PerformStoreRetvalCombine(N);
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
    return PerformStoreParamCombine(N);
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACTCombine(N, DCI);
  case ISD::VSELECT:
    return PerformVSELECTCombine(N, DCI);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
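  //
  // For example (roughly), a sufficiently aligned (load <4 x float>) is
  // replaced below by an NVPTXISD::LoadV4 producing four f32 values plus a
  // chain, and the original result is rebuilt with a BUILD_VECTOR.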
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v8i16:  // <4 x i16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool Load16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have an ld.v8.f16 instruction.
    // Instead, we split the vector into v2f16 chunks and load them with
    // ld.v4.b32.
    assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
    Load16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT VVT;
    switch (EltVT.getSimpleVT().SimpleTy) {
    case MVT::f16:
      VVT = MVT::v2f16;
      break;
    case MVT::bf16:
      VVT = MVT::v2bf16;
      break;
    case MVT::i16:
      VVT = MVT::v2i16;
      break;
    default:
      llvm_unreachable("Unsupported v8 vector type.");
    }
    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (Load16x2) {
    // Split v2f16 subvectors back into individual elements.
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}

static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization. Therefore, we must ensure the type is legal. For i1 and
      // i8, we set the loaded type to i16 and propagate the "real" type as the
      // memory type.
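      // For instance (roughly), an ldg of <2 x i8> is emitted as an
      // NVPTXISD::LDGV2 with two i16 results, and each lane is truncated back
      // to i8 after the load below.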
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands

      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      // Others
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
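      // The single i16 value produced here is truncated back to i8 below so
      // that users of the original node see the expected type.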
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &Results) {
  // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
  // result so that it can pass the legalization
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Reg = N->getOperand(1);
  SDValue Glue = N->getOperand(2);

  assert(Reg.getValueType() == MVT::i128 &&
         "Custom lowering for CopyFromReg with 128-bit reg only");
  SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
                                     N->getValueType(2)};
  SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};

  SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
                             {NewValue.getValue(0), NewValue.getValue(1)});

  Results.push_back(Pair);
  Results.push_back(NewValue.getValue(2));
  Results.push_back(NewValue.getValue(3));
}

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  case ISD::CopyFromReg:
    ReplaceCopyFromReg_128(N, DAG, Results);
    return;
  }
}

NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
          STI.getPTXVersion() >= 63)
        return AtomicExpansionKind::None;
      if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
          STI.getPTXVersion() >= 78)
        return AtomicExpansionKind::None;
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}