1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines an instruction selector for the NVPTX target. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "NVPTXISelDAGToDAG.h" 14 #include "MCTargetDesc/NVPTXBaseInfo.h" 15 #include "NVPTXUtilities.h" 16 #include "llvm/Analysis/ValueTracking.h" 17 #include "llvm/CodeGen/ISDOpcodes.h" 18 #include "llvm/IR/GlobalValue.h" 19 #include "llvm/IR/Instructions.h" 20 #include "llvm/IR/IntrinsicsNVPTX.h" 21 #include "llvm/Support/AtomicOrdering.h" 22 #include "llvm/Support/CommandLine.h" 23 #include "llvm/Support/Debug.h" 24 #include "llvm/Support/ErrorHandling.h" 25 #include "llvm/Support/raw_ostream.h" 26 #include "llvm/Target/TargetIntrinsicInfo.h" 27 28 using namespace llvm; 29 30 #define DEBUG_TYPE "nvptx-isel" 31 #define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection" 32 33 /// createNVPTXISelDag - This pass converts a legalized DAG into a 34 /// NVPTX-specific DAG, ready for instruction scheduling. 35 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM, 36 llvm::CodeGenOptLevel OptLevel) { 37 return new NVPTXDAGToDAGISel(TM, OptLevel); 38 } 39 40 char NVPTXDAGToDAGISel::ID = 0; 41 42 INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false) 43 44 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, 45 CodeGenOptLevel OptLevel) 46 : SelectionDAGISel(ID, tm, OptLevel), TM(tm) { 47 doMulWide = (OptLevel > CodeGenOptLevel::None); 48 } 49 50 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { 51 Subtarget = &MF.getSubtarget<NVPTXSubtarget>(); 52 return SelectionDAGISel::runOnMachineFunction(MF); 53 } 54 55 int NVPTXDAGToDAGISel::getDivF32Level() const { 56 return Subtarget->getTargetLowering()->getDivF32Level(); 57 } 58 59 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const { 60 return Subtarget->getTargetLowering()->usePrecSqrtF32(); 61 } 62 63 bool NVPTXDAGToDAGISel::useF32FTZ() const { 64 return Subtarget->getTargetLowering()->useF32FTZ(*MF); 65 } 66 67 bool NVPTXDAGToDAGISel::allowFMA() const { 68 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering(); 69 return TL->allowFMA(*MF, OptLevel); 70 } 71 72 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const { 73 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering(); 74 return TL->allowUnsafeFPMath(*MF); 75 } 76 77 bool NVPTXDAGToDAGISel::useShortPointers() const { 78 return TM.useShortPointers(); 79 } 80 81 /// Select - Select instructions not customized! Used for 82 /// expanded, promoted and normal instructions. 83 void NVPTXDAGToDAGISel::Select(SDNode *N) { 84 85 if (N->isMachineOpcode()) { 86 N->setNodeId(-1); 87 return; // Already selected. 88 } 89 90 switch (N->getOpcode()) { 91 case ISD::LOAD: 92 case ISD::ATOMIC_LOAD: 93 if (tryLoad(N)) 94 return; 95 break; 96 case ISD::STORE: 97 case ISD::ATOMIC_STORE: 98 if (tryStore(N)) 99 return; 100 break; 101 case ISD::EXTRACT_VECTOR_ELT: 102 if (tryEXTRACT_VECTOR_ELEMENT(N)) 103 return; 104 break; 105 case NVPTXISD::SETP_F16X2: 106 SelectSETP_F16X2(N); 107 return; 108 case NVPTXISD::SETP_BF16X2: 109 SelectSETP_BF16X2(N); 110 return; 111 case NVPTXISD::LoadV2: 112 case NVPTXISD::LoadV4: 113 if (tryLoadVector(N)) 114 return; 115 break; 116 case NVPTXISD::LDGV2: 117 case NVPTXISD::LDGV4: 118 case NVPTXISD::LDUV2: 119 case NVPTXISD::LDUV4: 120 if (tryLDGLDU(N)) 121 return; 122 break; 123 case NVPTXISD::StoreV2: 124 case NVPTXISD::StoreV4: 125 if (tryStoreVector(N)) 126 return; 127 break; 128 case NVPTXISD::LoadParam: 129 case NVPTXISD::LoadParamV2: 130 case NVPTXISD::LoadParamV4: 131 if (tryLoadParam(N)) 132 return; 133 break; 134 case NVPTXISD::StoreRetval: 135 case NVPTXISD::StoreRetvalV2: 136 case NVPTXISD::StoreRetvalV4: 137 if (tryStoreRetval(N)) 138 return; 139 break; 140 case NVPTXISD::StoreParam: 141 case NVPTXISD::StoreParamV2: 142 case NVPTXISD::StoreParamV4: 143 case NVPTXISD::StoreParamS32: 144 case NVPTXISD::StoreParamU32: 145 if (tryStoreParam(N)) 146 return; 147 break; 148 case ISD::INTRINSIC_WO_CHAIN: 149 if (tryIntrinsicNoChain(N)) 150 return; 151 break; 152 case ISD::INTRINSIC_W_CHAIN: 153 if (tryIntrinsicChain(N)) 154 return; 155 break; 156 case NVPTXISD::Tex1DFloatS32: 157 case NVPTXISD::Tex1DFloatFloat: 158 case NVPTXISD::Tex1DFloatFloatLevel: 159 case NVPTXISD::Tex1DFloatFloatGrad: 160 case NVPTXISD::Tex1DS32S32: 161 case NVPTXISD::Tex1DS32Float: 162 case NVPTXISD::Tex1DS32FloatLevel: 163 case NVPTXISD::Tex1DS32FloatGrad: 164 case NVPTXISD::Tex1DU32S32: 165 case NVPTXISD::Tex1DU32Float: 166 case NVPTXISD::Tex1DU32FloatLevel: 167 case NVPTXISD::Tex1DU32FloatGrad: 168 case NVPTXISD::Tex1DArrayFloatS32: 169 case NVPTXISD::Tex1DArrayFloatFloat: 170 case NVPTXISD::Tex1DArrayFloatFloatLevel: 171 case NVPTXISD::Tex1DArrayFloatFloatGrad: 172 case NVPTXISD::Tex1DArrayS32S32: 173 case NVPTXISD::Tex1DArrayS32Float: 174 case NVPTXISD::Tex1DArrayS32FloatLevel: 175 case NVPTXISD::Tex1DArrayS32FloatGrad: 176 case NVPTXISD::Tex1DArrayU32S32: 177 case NVPTXISD::Tex1DArrayU32Float: 178 case NVPTXISD::Tex1DArrayU32FloatLevel: 179 case NVPTXISD::Tex1DArrayU32FloatGrad: 180 case NVPTXISD::Tex2DFloatS32: 181 case NVPTXISD::Tex2DFloatFloat: 182 case NVPTXISD::Tex2DFloatFloatLevel: 183 case NVPTXISD::Tex2DFloatFloatGrad: 184 case NVPTXISD::Tex2DS32S32: 185 case NVPTXISD::Tex2DS32Float: 186 case NVPTXISD::Tex2DS32FloatLevel: 187 case NVPTXISD::Tex2DS32FloatGrad: 188 case NVPTXISD::Tex2DU32S32: 189 case NVPTXISD::Tex2DU32Float: 190 case NVPTXISD::Tex2DU32FloatLevel: 191 case NVPTXISD::Tex2DU32FloatGrad: 192 case NVPTXISD::Tex2DArrayFloatS32: 193 case NVPTXISD::Tex2DArrayFloatFloat: 194 case NVPTXISD::Tex2DArrayFloatFloatLevel: 195 case NVPTXISD::Tex2DArrayFloatFloatGrad: 196 case NVPTXISD::Tex2DArrayS32S32: 197 case NVPTXISD::Tex2DArrayS32Float: 198 case NVPTXISD::Tex2DArrayS32FloatLevel: 199 case NVPTXISD::Tex2DArrayS32FloatGrad: 200 case NVPTXISD::Tex2DArrayU32S32: 201 case NVPTXISD::Tex2DArrayU32Float: 202 case NVPTXISD::Tex2DArrayU32FloatLevel: 203 case NVPTXISD::Tex2DArrayU32FloatGrad: 204 case NVPTXISD::Tex3DFloatS32: 205 case NVPTXISD::Tex3DFloatFloat: 206 case NVPTXISD::Tex3DFloatFloatLevel: 207 case NVPTXISD::Tex3DFloatFloatGrad: 208 case NVPTXISD::Tex3DS32S32: 209 case NVPTXISD::Tex3DS32Float: 210 case NVPTXISD::Tex3DS32FloatLevel: 211 case NVPTXISD::Tex3DS32FloatGrad: 212 case NVPTXISD::Tex3DU32S32: 213 case NVPTXISD::Tex3DU32Float: 214 case NVPTXISD::Tex3DU32FloatLevel: 215 case NVPTXISD::Tex3DU32FloatGrad: 216 case NVPTXISD::TexCubeFloatFloat: 217 case NVPTXISD::TexCubeFloatFloatLevel: 218 case NVPTXISD::TexCubeS32Float: 219 case NVPTXISD::TexCubeS32FloatLevel: 220 case NVPTXISD::TexCubeU32Float: 221 case NVPTXISD::TexCubeU32FloatLevel: 222 case NVPTXISD::TexCubeArrayFloatFloat: 223 case NVPTXISD::TexCubeArrayFloatFloatLevel: 224 case NVPTXISD::TexCubeArrayS32Float: 225 case NVPTXISD::TexCubeArrayS32FloatLevel: 226 case NVPTXISD::TexCubeArrayU32Float: 227 case NVPTXISD::TexCubeArrayU32FloatLevel: 228 case NVPTXISD::Tld4R2DFloatFloat: 229 case NVPTXISD::Tld4G2DFloatFloat: 230 case NVPTXISD::Tld4B2DFloatFloat: 231 case NVPTXISD::Tld4A2DFloatFloat: 232 case NVPTXISD::Tld4R2DS64Float: 233 case NVPTXISD::Tld4G2DS64Float: 234 case NVPTXISD::Tld4B2DS64Float: 235 case NVPTXISD::Tld4A2DS64Float: 236 case NVPTXISD::Tld4R2DU64Float: 237 case NVPTXISD::Tld4G2DU64Float: 238 case NVPTXISD::Tld4B2DU64Float: 239 case NVPTXISD::Tld4A2DU64Float: 240 case NVPTXISD::TexUnified1DFloatS32: 241 case NVPTXISD::TexUnified1DFloatFloat: 242 case NVPTXISD::TexUnified1DFloatFloatLevel: 243 case NVPTXISD::TexUnified1DFloatFloatGrad: 244 case NVPTXISD::TexUnified1DS32S32: 245 case NVPTXISD::TexUnified1DS32Float: 246 case NVPTXISD::TexUnified1DS32FloatLevel: 247 case NVPTXISD::TexUnified1DS32FloatGrad: 248 case NVPTXISD::TexUnified1DU32S32: 249 case NVPTXISD::TexUnified1DU32Float: 250 case NVPTXISD::TexUnified1DU32FloatLevel: 251 case NVPTXISD::TexUnified1DU32FloatGrad: 252 case NVPTXISD::TexUnified1DArrayFloatS32: 253 case NVPTXISD::TexUnified1DArrayFloatFloat: 254 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 255 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 256 case NVPTXISD::TexUnified1DArrayS32S32: 257 case NVPTXISD::TexUnified1DArrayS32Float: 258 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 259 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 260 case NVPTXISD::TexUnified1DArrayU32S32: 261 case NVPTXISD::TexUnified1DArrayU32Float: 262 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 263 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 264 case NVPTXISD::TexUnified2DFloatS32: 265 case NVPTXISD::TexUnified2DFloatFloat: 266 case NVPTXISD::TexUnified2DFloatFloatLevel: 267 case NVPTXISD::TexUnified2DFloatFloatGrad: 268 case NVPTXISD::TexUnified2DS32S32: 269 case NVPTXISD::TexUnified2DS32Float: 270 case NVPTXISD::TexUnified2DS32FloatLevel: 271 case NVPTXISD::TexUnified2DS32FloatGrad: 272 case NVPTXISD::TexUnified2DU32S32: 273 case NVPTXISD::TexUnified2DU32Float: 274 case NVPTXISD::TexUnified2DU32FloatLevel: 275 case NVPTXISD::TexUnified2DU32FloatGrad: 276 case NVPTXISD::TexUnified2DArrayFloatS32: 277 case NVPTXISD::TexUnified2DArrayFloatFloat: 278 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 279 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 280 case NVPTXISD::TexUnified2DArrayS32S32: 281 case NVPTXISD::TexUnified2DArrayS32Float: 282 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 283 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 284 case NVPTXISD::TexUnified2DArrayU32S32: 285 case NVPTXISD::TexUnified2DArrayU32Float: 286 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 287 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 288 case NVPTXISD::TexUnified3DFloatS32: 289 case NVPTXISD::TexUnified3DFloatFloat: 290 case NVPTXISD::TexUnified3DFloatFloatLevel: 291 case NVPTXISD::TexUnified3DFloatFloatGrad: 292 case NVPTXISD::TexUnified3DS32S32: 293 case NVPTXISD::TexUnified3DS32Float: 294 case NVPTXISD::TexUnified3DS32FloatLevel: 295 case NVPTXISD::TexUnified3DS32FloatGrad: 296 case NVPTXISD::TexUnified3DU32S32: 297 case NVPTXISD::TexUnified3DU32Float: 298 case NVPTXISD::TexUnified3DU32FloatLevel: 299 case NVPTXISD::TexUnified3DU32FloatGrad: 300 case NVPTXISD::TexUnifiedCubeFloatFloat: 301 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 302 case NVPTXISD::TexUnifiedCubeS32Float: 303 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 304 case NVPTXISD::TexUnifiedCubeU32Float: 305 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 306 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 307 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 308 case NVPTXISD::TexUnifiedCubeArrayS32Float: 309 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 310 case NVPTXISD::TexUnifiedCubeArrayU32Float: 311 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 312 case NVPTXISD::TexUnifiedCubeFloatFloatGrad: 313 case NVPTXISD::TexUnifiedCubeS32FloatGrad: 314 case NVPTXISD::TexUnifiedCubeU32FloatGrad: 315 case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: 316 case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: 317 case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: 318 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 319 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 320 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 321 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 322 case NVPTXISD::Tld4UnifiedR2DS64Float: 323 case NVPTXISD::Tld4UnifiedG2DS64Float: 324 case NVPTXISD::Tld4UnifiedB2DS64Float: 325 case NVPTXISD::Tld4UnifiedA2DS64Float: 326 case NVPTXISD::Tld4UnifiedR2DU64Float: 327 case NVPTXISD::Tld4UnifiedG2DU64Float: 328 case NVPTXISD::Tld4UnifiedB2DU64Float: 329 case NVPTXISD::Tld4UnifiedA2DU64Float: 330 if (tryTextureIntrinsic(N)) 331 return; 332 break; 333 case NVPTXISD::Suld1DI8Clamp: 334 case NVPTXISD::Suld1DI16Clamp: 335 case NVPTXISD::Suld1DI32Clamp: 336 case NVPTXISD::Suld1DI64Clamp: 337 case NVPTXISD::Suld1DV2I8Clamp: 338 case NVPTXISD::Suld1DV2I16Clamp: 339 case NVPTXISD::Suld1DV2I32Clamp: 340 case NVPTXISD::Suld1DV2I64Clamp: 341 case NVPTXISD::Suld1DV4I8Clamp: 342 case NVPTXISD::Suld1DV4I16Clamp: 343 case NVPTXISD::Suld1DV4I32Clamp: 344 case NVPTXISD::Suld1DArrayI8Clamp: 345 case NVPTXISD::Suld1DArrayI16Clamp: 346 case NVPTXISD::Suld1DArrayI32Clamp: 347 case NVPTXISD::Suld1DArrayI64Clamp: 348 case NVPTXISD::Suld1DArrayV2I8Clamp: 349 case NVPTXISD::Suld1DArrayV2I16Clamp: 350 case NVPTXISD::Suld1DArrayV2I32Clamp: 351 case NVPTXISD::Suld1DArrayV2I64Clamp: 352 case NVPTXISD::Suld1DArrayV4I8Clamp: 353 case NVPTXISD::Suld1DArrayV4I16Clamp: 354 case NVPTXISD::Suld1DArrayV4I32Clamp: 355 case NVPTXISD::Suld2DI8Clamp: 356 case NVPTXISD::Suld2DI16Clamp: 357 case NVPTXISD::Suld2DI32Clamp: 358 case NVPTXISD::Suld2DI64Clamp: 359 case NVPTXISD::Suld2DV2I8Clamp: 360 case NVPTXISD::Suld2DV2I16Clamp: 361 case NVPTXISD::Suld2DV2I32Clamp: 362 case NVPTXISD::Suld2DV2I64Clamp: 363 case NVPTXISD::Suld2DV4I8Clamp: 364 case NVPTXISD::Suld2DV4I16Clamp: 365 case NVPTXISD::Suld2DV4I32Clamp: 366 case NVPTXISD::Suld2DArrayI8Clamp: 367 case NVPTXISD::Suld2DArrayI16Clamp: 368 case NVPTXISD::Suld2DArrayI32Clamp: 369 case NVPTXISD::Suld2DArrayI64Clamp: 370 case NVPTXISD::Suld2DArrayV2I8Clamp: 371 case NVPTXISD::Suld2DArrayV2I16Clamp: 372 case NVPTXISD::Suld2DArrayV2I32Clamp: 373 case NVPTXISD::Suld2DArrayV2I64Clamp: 374 case NVPTXISD::Suld2DArrayV4I8Clamp: 375 case NVPTXISD::Suld2DArrayV4I16Clamp: 376 case NVPTXISD::Suld2DArrayV4I32Clamp: 377 case NVPTXISD::Suld3DI8Clamp: 378 case NVPTXISD::Suld3DI16Clamp: 379 case NVPTXISD::Suld3DI32Clamp: 380 case NVPTXISD::Suld3DI64Clamp: 381 case NVPTXISD::Suld3DV2I8Clamp: 382 case NVPTXISD::Suld3DV2I16Clamp: 383 case NVPTXISD::Suld3DV2I32Clamp: 384 case NVPTXISD::Suld3DV2I64Clamp: 385 case NVPTXISD::Suld3DV4I8Clamp: 386 case NVPTXISD::Suld3DV4I16Clamp: 387 case NVPTXISD::Suld3DV4I32Clamp: 388 case NVPTXISD::Suld1DI8Trap: 389 case NVPTXISD::Suld1DI16Trap: 390 case NVPTXISD::Suld1DI32Trap: 391 case NVPTXISD::Suld1DI64Trap: 392 case NVPTXISD::Suld1DV2I8Trap: 393 case NVPTXISD::Suld1DV2I16Trap: 394 case NVPTXISD::Suld1DV2I32Trap: 395 case NVPTXISD::Suld1DV2I64Trap: 396 case NVPTXISD::Suld1DV4I8Trap: 397 case NVPTXISD::Suld1DV4I16Trap: 398 case NVPTXISD::Suld1DV4I32Trap: 399 case NVPTXISD::Suld1DArrayI8Trap: 400 case NVPTXISD::Suld1DArrayI16Trap: 401 case NVPTXISD::Suld1DArrayI32Trap: 402 case NVPTXISD::Suld1DArrayI64Trap: 403 case NVPTXISD::Suld1DArrayV2I8Trap: 404 case NVPTXISD::Suld1DArrayV2I16Trap: 405 case NVPTXISD::Suld1DArrayV2I32Trap: 406 case NVPTXISD::Suld1DArrayV2I64Trap: 407 case NVPTXISD::Suld1DArrayV4I8Trap: 408 case NVPTXISD::Suld1DArrayV4I16Trap: 409 case NVPTXISD::Suld1DArrayV4I32Trap: 410 case NVPTXISD::Suld2DI8Trap: 411 case NVPTXISD::Suld2DI16Trap: 412 case NVPTXISD::Suld2DI32Trap: 413 case NVPTXISD::Suld2DI64Trap: 414 case NVPTXISD::Suld2DV2I8Trap: 415 case NVPTXISD::Suld2DV2I16Trap: 416 case NVPTXISD::Suld2DV2I32Trap: 417 case NVPTXISD::Suld2DV2I64Trap: 418 case NVPTXISD::Suld2DV4I8Trap: 419 case NVPTXISD::Suld2DV4I16Trap: 420 case NVPTXISD::Suld2DV4I32Trap: 421 case NVPTXISD::Suld2DArrayI8Trap: 422 case NVPTXISD::Suld2DArrayI16Trap: 423 case NVPTXISD::Suld2DArrayI32Trap: 424 case NVPTXISD::Suld2DArrayI64Trap: 425 case NVPTXISD::Suld2DArrayV2I8Trap: 426 case NVPTXISD::Suld2DArrayV2I16Trap: 427 case NVPTXISD::Suld2DArrayV2I32Trap: 428 case NVPTXISD::Suld2DArrayV2I64Trap: 429 case NVPTXISD::Suld2DArrayV4I8Trap: 430 case NVPTXISD::Suld2DArrayV4I16Trap: 431 case NVPTXISD::Suld2DArrayV4I32Trap: 432 case NVPTXISD::Suld3DI8Trap: 433 case NVPTXISD::Suld3DI16Trap: 434 case NVPTXISD::Suld3DI32Trap: 435 case NVPTXISD::Suld3DI64Trap: 436 case NVPTXISD::Suld3DV2I8Trap: 437 case NVPTXISD::Suld3DV2I16Trap: 438 case NVPTXISD::Suld3DV2I32Trap: 439 case NVPTXISD::Suld3DV2I64Trap: 440 case NVPTXISD::Suld3DV4I8Trap: 441 case NVPTXISD::Suld3DV4I16Trap: 442 case NVPTXISD::Suld3DV4I32Trap: 443 case NVPTXISD::Suld1DI8Zero: 444 case NVPTXISD::Suld1DI16Zero: 445 case NVPTXISD::Suld1DI32Zero: 446 case NVPTXISD::Suld1DI64Zero: 447 case NVPTXISD::Suld1DV2I8Zero: 448 case NVPTXISD::Suld1DV2I16Zero: 449 case NVPTXISD::Suld1DV2I32Zero: 450 case NVPTXISD::Suld1DV2I64Zero: 451 case NVPTXISD::Suld1DV4I8Zero: 452 case NVPTXISD::Suld1DV4I16Zero: 453 case NVPTXISD::Suld1DV4I32Zero: 454 case NVPTXISD::Suld1DArrayI8Zero: 455 case NVPTXISD::Suld1DArrayI16Zero: 456 case NVPTXISD::Suld1DArrayI32Zero: 457 case NVPTXISD::Suld1DArrayI64Zero: 458 case NVPTXISD::Suld1DArrayV2I8Zero: 459 case NVPTXISD::Suld1DArrayV2I16Zero: 460 case NVPTXISD::Suld1DArrayV2I32Zero: 461 case NVPTXISD::Suld1DArrayV2I64Zero: 462 case NVPTXISD::Suld1DArrayV4I8Zero: 463 case NVPTXISD::Suld1DArrayV4I16Zero: 464 case NVPTXISD::Suld1DArrayV4I32Zero: 465 case NVPTXISD::Suld2DI8Zero: 466 case NVPTXISD::Suld2DI16Zero: 467 case NVPTXISD::Suld2DI32Zero: 468 case NVPTXISD::Suld2DI64Zero: 469 case NVPTXISD::Suld2DV2I8Zero: 470 case NVPTXISD::Suld2DV2I16Zero: 471 case NVPTXISD::Suld2DV2I32Zero: 472 case NVPTXISD::Suld2DV2I64Zero: 473 case NVPTXISD::Suld2DV4I8Zero: 474 case NVPTXISD::Suld2DV4I16Zero: 475 case NVPTXISD::Suld2DV4I32Zero: 476 case NVPTXISD::Suld2DArrayI8Zero: 477 case NVPTXISD::Suld2DArrayI16Zero: 478 case NVPTXISD::Suld2DArrayI32Zero: 479 case NVPTXISD::Suld2DArrayI64Zero: 480 case NVPTXISD::Suld2DArrayV2I8Zero: 481 case NVPTXISD::Suld2DArrayV2I16Zero: 482 case NVPTXISD::Suld2DArrayV2I32Zero: 483 case NVPTXISD::Suld2DArrayV2I64Zero: 484 case NVPTXISD::Suld2DArrayV4I8Zero: 485 case NVPTXISD::Suld2DArrayV4I16Zero: 486 case NVPTXISD::Suld2DArrayV4I32Zero: 487 case NVPTXISD::Suld3DI8Zero: 488 case NVPTXISD::Suld3DI16Zero: 489 case NVPTXISD::Suld3DI32Zero: 490 case NVPTXISD::Suld3DI64Zero: 491 case NVPTXISD::Suld3DV2I8Zero: 492 case NVPTXISD::Suld3DV2I16Zero: 493 case NVPTXISD::Suld3DV2I32Zero: 494 case NVPTXISD::Suld3DV2I64Zero: 495 case NVPTXISD::Suld3DV4I8Zero: 496 case NVPTXISD::Suld3DV4I16Zero: 497 case NVPTXISD::Suld3DV4I32Zero: 498 if (trySurfaceIntrinsic(N)) 499 return; 500 break; 501 case ISD::AND: 502 case ISD::SRA: 503 case ISD::SRL: 504 // Try to select BFE 505 if (tryBFE(N)) 506 return; 507 break; 508 case ISD::ADDRSPACECAST: 509 SelectAddrSpaceCast(N); 510 return; 511 case ISD::ConstantFP: 512 if (tryConstantFP(N)) 513 return; 514 break; 515 default: 516 break; 517 } 518 SelectCode(N); 519 } 520 521 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) { 522 unsigned IID = N->getConstantOperandVal(1); 523 switch (IID) { 524 default: 525 return false; 526 case Intrinsic::nvvm_ldg_global_f: 527 case Intrinsic::nvvm_ldg_global_i: 528 case Intrinsic::nvvm_ldg_global_p: 529 case Intrinsic::nvvm_ldu_global_f: 530 case Intrinsic::nvvm_ldu_global_i: 531 case Intrinsic::nvvm_ldu_global_p: 532 return tryLDGLDU(N); 533 } 534 } 535 536 // There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we 537 // have to load them into an .(b)f16 register first. 538 bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) { 539 if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16) 540 return false; 541 SDValue Val = CurDAG->getTargetConstantFP( 542 cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0)); 543 SDNode *LoadConstF16 = CurDAG->getMachineNode( 544 (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16 545 : NVPTX::LOAD_CONST_BF16), 546 SDLoc(N), N->getValueType(0), Val); 547 ReplaceNode(N, LoadConstF16); 548 return true; 549 } 550 551 // Map ISD:CONDCODE value to appropriate CmpMode expected by 552 // NVPTXInstPrinter::printCmpMode() 553 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) { 554 using NVPTX::PTXCmpMode::CmpMode; 555 unsigned PTXCmpMode = [](ISD::CondCode CC) { 556 switch (CC) { 557 default: 558 llvm_unreachable("Unexpected condition code."); 559 case ISD::SETOEQ: 560 return CmpMode::EQ; 561 case ISD::SETOGT: 562 return CmpMode::GT; 563 case ISD::SETOGE: 564 return CmpMode::GE; 565 case ISD::SETOLT: 566 return CmpMode::LT; 567 case ISD::SETOLE: 568 return CmpMode::LE; 569 case ISD::SETONE: 570 return CmpMode::NE; 571 case ISD::SETO: 572 return CmpMode::NUM; 573 case ISD::SETUO: 574 return CmpMode::NotANumber; 575 case ISD::SETUEQ: 576 return CmpMode::EQU; 577 case ISD::SETUGT: 578 return CmpMode::GTU; 579 case ISD::SETUGE: 580 return CmpMode::GEU; 581 case ISD::SETULT: 582 return CmpMode::LTU; 583 case ISD::SETULE: 584 return CmpMode::LEU; 585 case ISD::SETUNE: 586 return CmpMode::NEU; 587 case ISD::SETEQ: 588 return CmpMode::EQ; 589 case ISD::SETGT: 590 return CmpMode::GT; 591 case ISD::SETGE: 592 return CmpMode::GE; 593 case ISD::SETLT: 594 return CmpMode::LT; 595 case ISD::SETLE: 596 return CmpMode::LE; 597 case ISD::SETNE: 598 return CmpMode::NE; 599 } 600 }(CondCode.get()); 601 602 if (FTZ) 603 PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG; 604 605 return PTXCmpMode; 606 } 607 608 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) { 609 unsigned PTXCmpMode = 610 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ()); 611 SDLoc DL(N); 612 SDNode *SetP = CurDAG->getMachineNode( 613 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0), 614 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32)); 615 ReplaceNode(N, SetP); 616 return true; 617 } 618 619 bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) { 620 unsigned PTXCmpMode = 621 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ()); 622 SDLoc DL(N); 623 SDNode *SetP = CurDAG->getMachineNode( 624 NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0), 625 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32)); 626 ReplaceNode(N, SetP); 627 return true; 628 } 629 630 // Find all instances of extract_vector_elt that use this v2f16 vector 631 // and coalesce them into a scattering move instruction. 632 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { 633 SDValue Vector = N->getOperand(0); 634 635 // We only care about 16x2 as it's the only real vector type we 636 // need to deal with. 637 MVT VT = Vector.getSimpleValueType(); 638 if (!Isv2x16VT(VT)) 639 return false; 640 // Find and record all uses of this vector that extract element 0 or 1. 641 SmallVector<SDNode *, 4> E0, E1; 642 for (auto *U : Vector.getNode()->uses()) { 643 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 644 continue; 645 if (U->getOperand(0) != Vector) 646 continue; 647 if (const ConstantSDNode *IdxConst = 648 dyn_cast<ConstantSDNode>(U->getOperand(1))) { 649 if (IdxConst->getZExtValue() == 0) 650 E0.push_back(U); 651 else if (IdxConst->getZExtValue() == 1) 652 E1.push_back(U); 653 else 654 llvm_unreachable("Invalid vector index."); 655 } 656 } 657 658 // There's no point scattering f16x2 if we only ever access one 659 // element of it. 660 if (E0.empty() || E1.empty()) 661 return false; 662 663 // Merge (f16 extractelt(V, 0), f16 extractelt(V,1)) 664 // into f16,f16 SplitF16x2(V) 665 MVT EltVT = VT.getVectorElementType(); 666 SDNode *ScatterOp = 667 CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector); 668 for (auto *Node : E0) 669 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0)); 670 for (auto *Node : E1) 671 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1)); 672 673 return true; 674 } 675 676 static unsigned int getCodeAddrSpace(MemSDNode *N) { 677 const Value *Src = N->getMemOperand()->getValue(); 678 679 if (!Src) 680 return NVPTX::PTXLdStInstCode::GENERIC; 681 682 if (auto *PT = dyn_cast<PointerType>(Src->getType())) { 683 switch (PT->getAddressSpace()) { 684 case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL; 685 case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL; 686 case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED; 687 case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC; 688 case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM; 689 case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT; 690 default: break; 691 } 692 } 693 return NVPTX::PTXLdStInstCode::GENERIC; 694 } 695 696 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, 697 unsigned CodeAddrSpace, MachineFunction *F) { 698 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address 699 // space. 700 // 701 // We have two ways of identifying invariant loads: Loads may be explicitly 702 // marked as invariant, or we may infer them to be invariant. 703 // 704 // We currently infer invariance for loads from 705 // - constant global variables, and 706 // - kernel function pointer params that are noalias (i.e. __restrict) and 707 // never written to. 708 // 709 // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally 710 // not during the SelectionDAG phase). 711 // 712 // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for 713 // explicitly invariant loads because these are how clang tells us to use ldg 714 // when the user uses a builtin. 715 if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL) 716 return false; 717 718 if (N->isInvariant()) 719 return true; 720 721 bool IsKernelFn = isKernelFunction(F->getFunction()); 722 723 // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly 724 // because the former looks through phi nodes while the latter does not. We 725 // need to look through phi nodes to handle pointer induction variables. 726 SmallVector<const Value *, 8> Objs; 727 getUnderlyingObjects(N->getMemOperand()->getValue(), Objs); 728 729 return all_of(Objs, [&](const Value *V) { 730 if (auto *A = dyn_cast<const Argument>(V)) 731 return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr(); 732 if (auto *GV = dyn_cast<const GlobalVariable>(V)) 733 return GV->isConstant(); 734 return false; 735 }); 736 } 737 738 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) { 739 unsigned IID = N->getConstantOperandVal(0); 740 switch (IID) { 741 default: 742 return false; 743 case Intrinsic::nvvm_texsurf_handle_internal: 744 SelectTexSurfHandle(N); 745 return true; 746 } 747 } 748 749 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) { 750 // Op 0 is the intrinsic ID 751 SDValue Wrapper = N->getOperand(1); 752 SDValue GlobalVal = Wrapper.getOperand(0); 753 ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), 754 MVT::i64, GlobalVal)); 755 } 756 757 void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { 758 SDValue Src = N->getOperand(0); 759 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N); 760 unsigned SrcAddrSpace = CastN->getSrcAddressSpace(); 761 unsigned DstAddrSpace = CastN->getDestAddressSpace(); 762 assert(SrcAddrSpace != DstAddrSpace && 763 "addrspacecast must be between different address spaces"); 764 765 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) { 766 // Specific to generic 767 unsigned Opc; 768 switch (SrcAddrSpace) { 769 default: report_fatal_error("Bad address space in addrspacecast"); 770 case ADDRESS_SPACE_GLOBAL: 771 Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes; 772 break; 773 case ADDRESS_SPACE_SHARED: 774 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432 775 : NVPTX::cvta_shared_yes_64) 776 : NVPTX::cvta_shared_yes; 777 break; 778 case ADDRESS_SPACE_CONST: 779 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432 780 : NVPTX::cvta_const_yes_64) 781 : NVPTX::cvta_const_yes; 782 break; 783 case ADDRESS_SPACE_LOCAL: 784 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432 785 : NVPTX::cvta_local_yes_64) 786 : NVPTX::cvta_local_yes; 787 break; 788 } 789 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), 790 Src)); 791 return; 792 } else { 793 // Generic to specific 794 if (SrcAddrSpace != 0) 795 report_fatal_error("Cannot cast between two non-generic address spaces"); 796 unsigned Opc; 797 switch (DstAddrSpace) { 798 default: report_fatal_error("Bad address space in addrspacecast"); 799 case ADDRESS_SPACE_GLOBAL: 800 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64 801 : NVPTX::cvta_to_global_yes; 802 break; 803 case ADDRESS_SPACE_SHARED: 804 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264 805 : NVPTX::cvta_to_shared_yes_64) 806 : NVPTX::cvta_to_shared_yes; 807 break; 808 case ADDRESS_SPACE_CONST: 809 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264 810 : NVPTX::cvta_to_const_yes_64) 811 : NVPTX::cvta_to_const_yes; 812 break; 813 case ADDRESS_SPACE_LOCAL: 814 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264 815 : NVPTX::cvta_to_local_yes_64) 816 : NVPTX::cvta_to_local_yes; 817 break; 818 case ADDRESS_SPACE_PARAM: 819 Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64 820 : NVPTX::nvvm_ptr_gen_to_param; 821 break; 822 } 823 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), 824 Src)); 825 return; 826 } 827 } 828 829 // Helper function template to reduce amount of boilerplate code for 830 // opcode selection. 831 static std::optional<unsigned> 832 pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, 833 unsigned Opcode_i16, unsigned Opcode_i32, 834 std::optional<unsigned> Opcode_i64, unsigned Opcode_f32, 835 std::optional<unsigned> Opcode_f64) { 836 switch (VT) { 837 case MVT::i1: 838 case MVT::i8: 839 return Opcode_i8; 840 case MVT::i16: 841 return Opcode_i16; 842 case MVT::i32: 843 return Opcode_i32; 844 case MVT::i64: 845 return Opcode_i64; 846 case MVT::f16: 847 case MVT::bf16: 848 return Opcode_i16; 849 case MVT::v2f16: 850 case MVT::v2bf16: 851 case MVT::v2i16: 852 case MVT::v4i8: 853 return Opcode_i32; 854 case MVT::f32: 855 return Opcode_f32; 856 case MVT::f64: 857 return Opcode_f64; 858 default: 859 return std::nullopt; 860 } 861 } 862 863 static int getLdStRegType(EVT VT) { 864 if (VT.isFloatingPoint()) 865 switch (VT.getSimpleVT().SimpleTy) { 866 case MVT::f16: 867 case MVT::bf16: 868 case MVT::v2f16: 869 case MVT::v2bf16: 870 return NVPTX::PTXLdStInstCode::Untyped; 871 default: 872 return NVPTX::PTXLdStInstCode::Float; 873 } 874 else 875 return NVPTX::PTXLdStInstCode::Unsigned; 876 } 877 878 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { 879 SDLoc dl(N); 880 MemSDNode *LD = cast<MemSDNode>(N); 881 assert(LD->readMem() && "Expected load"); 882 LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N); 883 EVT LoadedVT = LD->getMemoryVT(); 884 SDNode *NVPTXLD = nullptr; 885 886 // do not support pre/post inc/dec 887 if (PlainLoad && PlainLoad->isIndexed()) 888 return false; 889 890 if (!LoadedVT.isSimple()) 891 return false; 892 893 AtomicOrdering Ordering = LD->getSuccessOrdering(); 894 // In order to lower atomic loads with stronger guarantees we would need to 895 // use load.acquire or insert fences. However these features were only added 896 // with PTX ISA 6.0 / sm_70. 897 // TODO: Check if we can actually use the new instructions and implement them. 898 if (isStrongerThanMonotonic(Ordering)) 899 return false; 900 901 // Address Space Setting 902 unsigned int CodeAddrSpace = getCodeAddrSpace(LD); 903 if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) { 904 return tryLDGLDU(N); 905 } 906 907 unsigned int PointerSize = 908 CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace()); 909 910 // Volatile Setting 911 // - .volatile is only available for .global and .shared 912 // - .volatile has the same memory synchronization semantics as .relaxed.sys 913 bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic; 914 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && 915 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && 916 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) 917 isVolatile = false; 918 919 // Type Setting: fromType + fromTypeWidth 920 // 921 // Sign : ISD::SEXTLOAD 922 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the 923 // type is integer 924 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float 925 MVT SimpleVT = LoadedVT.getSimpleVT(); 926 MVT ScalarVT = SimpleVT.getScalarType(); 927 // Read at least 8 bits (predicates are stored as 8-bit values) 928 unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits()); 929 unsigned int fromType; 930 931 // Vector Setting 932 unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; 933 if (SimpleVT.isVector()) { 934 assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) && 935 "Unexpected vector type"); 936 // v2f16/v2bf16/v2i16 is loaded using ld.b32 937 fromTypeWidth = 32; 938 } 939 940 if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD)) 941 fromType = NVPTX::PTXLdStInstCode::Signed; 942 else 943 fromType = getLdStRegType(ScalarVT); 944 945 // Create the machine instruction DAG 946 SDValue Chain = N->getOperand(0); 947 SDValue N1 = N->getOperand(1); 948 SDValue Addr; 949 SDValue Offset, Base; 950 std::optional<unsigned> Opcode; 951 MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy; 952 953 if (SelectDirectAddr(N1, Addr)) { 954 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, 955 NVPTX::LD_i32_avar, NVPTX::LD_i64_avar, 956 NVPTX::LD_f32_avar, NVPTX::LD_f64_avar); 957 if (!Opcode) 958 return false; 959 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), 960 getI32Imm(vecType, dl), getI32Imm(fromType, dl), 961 getI32Imm(fromTypeWidth, dl), Addr, Chain }; 962 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); 963 } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset) 964 : SelectADDRsi(N1.getNode(), N1, Base, Offset)) { 965 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi, 966 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi, 967 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi); 968 if (!Opcode) 969 return false; 970 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), 971 getI32Imm(vecType, dl), getI32Imm(fromType, dl), 972 getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; 973 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); 974 } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset) 975 : SelectADDRri(N1.getNode(), N1, Base, Offset)) { 976 if (PointerSize == 64) 977 Opcode = 978 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64, 979 NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, 980 NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64); 981 else 982 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, 983 NVPTX::LD_i32_ari, NVPTX::LD_i64_ari, 984 NVPTX::LD_f32_ari, NVPTX::LD_f64_ari); 985 if (!Opcode) 986 return false; 987 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), 988 getI32Imm(vecType, dl), getI32Imm(fromType, dl), 989 getI32Imm(fromTypeWidth, dl), Base, Offset, Chain }; 990 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); 991 } else { 992 if (PointerSize == 64) 993 Opcode = 994 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64, 995 NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, 996 NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64); 997 else 998 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, 999 NVPTX::LD_i32_areg, NVPTX::LD_i64_areg, 1000 NVPTX::LD_f32_areg, NVPTX::LD_f64_areg); 1001 if (!Opcode) 1002 return false; 1003 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl), 1004 getI32Imm(vecType, dl), getI32Imm(fromType, dl), 1005 getI32Imm(fromTypeWidth, dl), N1, Chain }; 1006 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops); 1007 } 1008 1009 if (!NVPTXLD) 1010 return false; 1011 1012 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); 1013 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef}); 1014 1015 ReplaceNode(N, NVPTXLD); 1016 return true; 1017 } 1018 1019 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { 1020 1021 SDValue Chain = N->getOperand(0); 1022 SDValue Op1 = N->getOperand(1); 1023 SDValue Addr, Offset, Base; 1024 std::optional<unsigned> Opcode; 1025 SDLoc DL(N); 1026 SDNode *LD; 1027 MemSDNode *MemSD = cast<MemSDNode>(N); 1028 EVT LoadedVT = MemSD->getMemoryVT(); 1029 1030 if (!LoadedVT.isSimple()) 1031 return false; 1032 1033 // Address Space Setting 1034 unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD); 1035 if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { 1036 return tryLDGLDU(N); 1037 } 1038 1039 unsigned int PointerSize = 1040 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); 1041 1042 // Volatile Setting 1043 // - .volatile is only availalble for .global and .shared 1044 bool IsVolatile = MemSD->isVolatile(); 1045 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && 1046 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && 1047 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) 1048 IsVolatile = false; 1049 1050 // Vector Setting 1051 MVT SimpleVT = LoadedVT.getSimpleVT(); 1052 1053 // Type Setting: fromType + fromTypeWidth 1054 // 1055 // Sign : ISD::SEXTLOAD 1056 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the 1057 // type is integer 1058 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float 1059 MVT ScalarVT = SimpleVT.getScalarType(); 1060 // Read at least 8 bits (predicates are stored as 8-bit values) 1061 unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits()); 1062 unsigned int FromType; 1063 // The last operand holds the original LoadSDNode::getExtensionType() value 1064 unsigned ExtensionType = cast<ConstantSDNode>( 1065 N->getOperand(N->getNumOperands() - 1))->getZExtValue(); 1066 if (ExtensionType == ISD::SEXTLOAD) 1067 FromType = NVPTX::PTXLdStInstCode::Signed; 1068 else 1069 FromType = getLdStRegType(ScalarVT); 1070 1071 unsigned VecType; 1072 1073 switch (N->getOpcode()) { 1074 case NVPTXISD::LoadV2: 1075 VecType = NVPTX::PTXLdStInstCode::V2; 1076 break; 1077 case NVPTXISD::LoadV4: 1078 VecType = NVPTX::PTXLdStInstCode::V4; 1079 break; 1080 default: 1081 return false; 1082 } 1083 1084 EVT EltVT = N->getValueType(0); 1085 1086 // v8x16 is a special case. PTX doesn't have ld.v8.16 1087 // instruction. Instead, we split the vector into v2x16 chunks and 1088 // load them with ld.v4.b32. 1089 if (Isv2x16VT(EltVT)) { 1090 assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode."); 1091 EltVT = MVT::i32; 1092 FromType = NVPTX::PTXLdStInstCode::Untyped; 1093 FromTypeWidth = 32; 1094 } 1095 1096 if (SelectDirectAddr(Op1, Addr)) { 1097 switch (N->getOpcode()) { 1098 default: 1099 return false; 1100 case NVPTXISD::LoadV2: 1101 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1102 NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar, 1103 NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar, 1104 NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar); 1105 break; 1106 case NVPTXISD::LoadV4: 1107 Opcode = 1108 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar, 1109 NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, 1110 std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt); 1111 break; 1112 } 1113 if (!Opcode) 1114 return false; 1115 SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), 1116 getI32Imm(VecType, DL), getI32Imm(FromType, DL), 1117 getI32Imm(FromTypeWidth, DL), Addr, Chain }; 1118 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); 1119 } else if (PointerSize == 64 1120 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset) 1121 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) { 1122 switch (N->getOpcode()) { 1123 default: 1124 return false; 1125 case NVPTXISD::LoadV2: 1126 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1127 NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi, 1128 NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi, 1129 NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi); 1130 break; 1131 case NVPTXISD::LoadV4: 1132 Opcode = 1133 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi, 1134 NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, 1135 std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt); 1136 break; 1137 } 1138 if (!Opcode) 1139 return false; 1140 SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), 1141 getI32Imm(VecType, DL), getI32Imm(FromType, DL), 1142 getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; 1143 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); 1144 } else if (PointerSize == 64 1145 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset) 1146 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { 1147 if (PointerSize == 64) { 1148 switch (N->getOpcode()) { 1149 default: 1150 return false; 1151 case NVPTXISD::LoadV2: 1152 Opcode = 1153 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1154 NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64, 1155 NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64, 1156 NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64); 1157 break; 1158 case NVPTXISD::LoadV4: 1159 Opcode = pickOpcodeForVT( 1160 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64, 1161 NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt, 1162 NVPTX::LDV_f32_v4_ari_64, std::nullopt); 1163 break; 1164 } 1165 } else { 1166 switch (N->getOpcode()) { 1167 default: 1168 return false; 1169 case NVPTXISD::LoadV2: 1170 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1171 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari, 1172 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari, 1173 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari); 1174 break; 1175 case NVPTXISD::LoadV4: 1176 Opcode = 1177 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari, 1178 NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, 1179 std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt); 1180 break; 1181 } 1182 } 1183 if (!Opcode) 1184 return false; 1185 SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), 1186 getI32Imm(VecType, DL), getI32Imm(FromType, DL), 1187 getI32Imm(FromTypeWidth, DL), Base, Offset, Chain }; 1188 1189 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); 1190 } else { 1191 if (PointerSize == 64) { 1192 switch (N->getOpcode()) { 1193 default: 1194 return false; 1195 case NVPTXISD::LoadV2: 1196 Opcode = pickOpcodeForVT( 1197 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64, 1198 NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64, 1199 NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64, 1200 NVPTX::LDV_f64_v2_areg_64); 1201 break; 1202 case NVPTXISD::LoadV4: 1203 Opcode = pickOpcodeForVT( 1204 EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64, 1205 NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt, 1206 NVPTX::LDV_f32_v4_areg_64, std::nullopt); 1207 break; 1208 } 1209 } else { 1210 switch (N->getOpcode()) { 1211 default: 1212 return false; 1213 case NVPTXISD::LoadV2: 1214 Opcode = 1215 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg, 1216 NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg, 1217 NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg, 1218 NVPTX::LDV_f64_v2_areg); 1219 break; 1220 case NVPTXISD::LoadV4: 1221 Opcode = 1222 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg, 1223 NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, 1224 std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt); 1225 break; 1226 } 1227 } 1228 if (!Opcode) 1229 return false; 1230 SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL), 1231 getI32Imm(VecType, DL), getI32Imm(FromType, DL), 1232 getI32Imm(FromTypeWidth, DL), Op1, Chain }; 1233 LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops); 1234 } 1235 1236 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); 1237 CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef}); 1238 1239 ReplaceNode(N, LD); 1240 return true; 1241 } 1242 1243 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { 1244 1245 SDValue Chain = N->getOperand(0); 1246 SDValue Op1; 1247 MemSDNode *Mem; 1248 bool IsLDG = true; 1249 1250 // If this is an LDG intrinsic, the address is the third operand. If its an 1251 // LDG/LDU SD node (from custom vector handling), then its the second operand 1252 if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { 1253 Op1 = N->getOperand(2); 1254 Mem = cast<MemIntrinsicSDNode>(N); 1255 unsigned IID = N->getConstantOperandVal(1); 1256 switch (IID) { 1257 default: 1258 return false; 1259 case Intrinsic::nvvm_ldg_global_f: 1260 case Intrinsic::nvvm_ldg_global_i: 1261 case Intrinsic::nvvm_ldg_global_p: 1262 IsLDG = true; 1263 break; 1264 case Intrinsic::nvvm_ldu_global_f: 1265 case Intrinsic::nvvm_ldu_global_i: 1266 case Intrinsic::nvvm_ldu_global_p: 1267 IsLDG = false; 1268 break; 1269 } 1270 } else { 1271 Op1 = N->getOperand(1); 1272 Mem = cast<MemSDNode>(N); 1273 } 1274 1275 std::optional<unsigned> Opcode; 1276 SDLoc DL(N); 1277 SDNode *LD; 1278 SDValue Base, Offset, Addr; 1279 EVT OrigType = N->getValueType(0); 1280 1281 EVT EltVT = Mem->getMemoryVT(); 1282 unsigned NumElts = 1; 1283 if (EltVT.isVector()) { 1284 NumElts = EltVT.getVectorNumElements(); 1285 EltVT = EltVT.getVectorElementType(); 1286 // vectors of 16bits type are loaded/stored as multiples of v2x16 elements. 1287 if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) || 1288 (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) || 1289 (EltVT == MVT::i16 && OrigType == MVT::v2i16)) { 1290 assert(NumElts % 2 == 0 && "Vector must have even number of elements"); 1291 EltVT = OrigType; 1292 NumElts /= 2; 1293 } else if (OrigType == MVT::v4i8) { 1294 EltVT = OrigType; 1295 NumElts = 1; 1296 } 1297 } 1298 1299 // Build the "promoted" result VTList for the load. If we are really loading 1300 // i8s, then the return type will be promoted to i16 since we do not expose 1301 // 8-bit registers in NVPTX. 1302 EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT; 1303 SmallVector<EVT, 5> InstVTs; 1304 for (unsigned i = 0; i != NumElts; ++i) { 1305 InstVTs.push_back(NodeVT); 1306 } 1307 InstVTs.push_back(MVT::Other); 1308 SDVTList InstVTList = CurDAG->getVTList(InstVTs); 1309 1310 if (SelectDirectAddr(Op1, Addr)) { 1311 switch (N->getOpcode()) { 1312 default: 1313 return false; 1314 case ISD::LOAD: 1315 case ISD::INTRINSIC_W_CHAIN: 1316 if (IsLDG) 1317 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1318 NVPTX::INT_PTX_LDG_GLOBAL_i8avar, 1319 NVPTX::INT_PTX_LDG_GLOBAL_i16avar, 1320 NVPTX::INT_PTX_LDG_GLOBAL_i32avar, 1321 NVPTX::INT_PTX_LDG_GLOBAL_i64avar, 1322 NVPTX::INT_PTX_LDG_GLOBAL_f32avar, 1323 NVPTX::INT_PTX_LDG_GLOBAL_f64avar); 1324 else 1325 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1326 NVPTX::INT_PTX_LDU_GLOBAL_i8avar, 1327 NVPTX::INT_PTX_LDU_GLOBAL_i16avar, 1328 NVPTX::INT_PTX_LDU_GLOBAL_i32avar, 1329 NVPTX::INT_PTX_LDU_GLOBAL_i64avar, 1330 NVPTX::INT_PTX_LDU_GLOBAL_f32avar, 1331 NVPTX::INT_PTX_LDU_GLOBAL_f64avar); 1332 break; 1333 case NVPTXISD::LoadV2: 1334 case NVPTXISD::LDGV2: 1335 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1336 NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar, 1337 NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar, 1338 NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar, 1339 NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar, 1340 NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar, 1341 NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar); 1342 break; 1343 case NVPTXISD::LDUV2: 1344 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1345 NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar, 1346 NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar, 1347 NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar, 1348 NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar, 1349 NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar, 1350 NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar); 1351 break; 1352 case NVPTXISD::LoadV4: 1353 case NVPTXISD::LDGV4: 1354 Opcode = pickOpcodeForVT( 1355 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar, 1356 NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar, 1357 NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt, 1358 NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt); 1359 break; 1360 case NVPTXISD::LDUV4: 1361 Opcode = pickOpcodeForVT( 1362 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar, 1363 NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar, 1364 NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt, 1365 NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt); 1366 break; 1367 } 1368 if (!Opcode) 1369 return false; 1370 SDValue Ops[] = { Addr, Chain }; 1371 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops); 1372 } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset) 1373 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) { 1374 if (TM.is64Bit()) { 1375 switch (N->getOpcode()) { 1376 default: 1377 return false; 1378 case ISD::LOAD: 1379 case ISD::INTRINSIC_W_CHAIN: 1380 if (IsLDG) 1381 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1382 NVPTX::INT_PTX_LDG_GLOBAL_i8ari64, 1383 NVPTX::INT_PTX_LDG_GLOBAL_i16ari64, 1384 NVPTX::INT_PTX_LDG_GLOBAL_i32ari64, 1385 NVPTX::INT_PTX_LDG_GLOBAL_i64ari64, 1386 NVPTX::INT_PTX_LDG_GLOBAL_f32ari64, 1387 NVPTX::INT_PTX_LDG_GLOBAL_f64ari64); 1388 else 1389 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1390 NVPTX::INT_PTX_LDU_GLOBAL_i8ari64, 1391 NVPTX::INT_PTX_LDU_GLOBAL_i16ari64, 1392 NVPTX::INT_PTX_LDU_GLOBAL_i32ari64, 1393 NVPTX::INT_PTX_LDU_GLOBAL_i64ari64, 1394 NVPTX::INT_PTX_LDU_GLOBAL_f32ari64, 1395 NVPTX::INT_PTX_LDU_GLOBAL_f64ari64); 1396 break; 1397 case NVPTXISD::LoadV2: 1398 case NVPTXISD::LDGV2: 1399 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1400 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64, 1401 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64, 1402 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64, 1403 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64, 1404 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64, 1405 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64); 1406 break; 1407 case NVPTXISD::LDUV2: 1408 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1409 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64, 1410 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64, 1411 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64, 1412 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64, 1413 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64, 1414 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64); 1415 break; 1416 case NVPTXISD::LoadV4: 1417 case NVPTXISD::LDGV4: 1418 Opcode = pickOpcodeForVT( 1419 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64, 1420 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64, 1421 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt, 1422 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt); 1423 break; 1424 case NVPTXISD::LDUV4: 1425 Opcode = pickOpcodeForVT( 1426 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64, 1427 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64, 1428 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt, 1429 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt); 1430 break; 1431 } 1432 } else { 1433 switch (N->getOpcode()) { 1434 default: 1435 return false; 1436 case ISD::LOAD: 1437 case ISD::INTRINSIC_W_CHAIN: 1438 if (IsLDG) 1439 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1440 NVPTX::INT_PTX_LDG_GLOBAL_i8ari, 1441 NVPTX::INT_PTX_LDG_GLOBAL_i16ari, 1442 NVPTX::INT_PTX_LDG_GLOBAL_i32ari, 1443 NVPTX::INT_PTX_LDG_GLOBAL_i64ari, 1444 NVPTX::INT_PTX_LDG_GLOBAL_f32ari, 1445 NVPTX::INT_PTX_LDG_GLOBAL_f64ari); 1446 else 1447 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1448 NVPTX::INT_PTX_LDU_GLOBAL_i8ari, 1449 NVPTX::INT_PTX_LDU_GLOBAL_i16ari, 1450 NVPTX::INT_PTX_LDU_GLOBAL_i32ari, 1451 NVPTX::INT_PTX_LDU_GLOBAL_i64ari, 1452 NVPTX::INT_PTX_LDU_GLOBAL_f32ari, 1453 NVPTX::INT_PTX_LDU_GLOBAL_f64ari); 1454 break; 1455 case NVPTXISD::LoadV2: 1456 case NVPTXISD::LDGV2: 1457 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1458 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32, 1459 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32, 1460 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32, 1461 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32, 1462 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32, 1463 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32); 1464 break; 1465 case NVPTXISD::LDUV2: 1466 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1467 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32, 1468 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32, 1469 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32, 1470 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32, 1471 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32, 1472 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32); 1473 break; 1474 case NVPTXISD::LoadV4: 1475 case NVPTXISD::LDGV4: 1476 Opcode = pickOpcodeForVT( 1477 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32, 1478 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32, 1479 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt, 1480 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt); 1481 break; 1482 case NVPTXISD::LDUV4: 1483 Opcode = pickOpcodeForVT( 1484 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32, 1485 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32, 1486 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt, 1487 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt); 1488 break; 1489 } 1490 } 1491 if (!Opcode) 1492 return false; 1493 SDValue Ops[] = {Base, Offset, Chain}; 1494 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops); 1495 } else { 1496 if (TM.is64Bit()) { 1497 switch (N->getOpcode()) { 1498 default: 1499 return false; 1500 case ISD::LOAD: 1501 case ISD::INTRINSIC_W_CHAIN: 1502 if (IsLDG) 1503 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1504 NVPTX::INT_PTX_LDG_GLOBAL_i8areg64, 1505 NVPTX::INT_PTX_LDG_GLOBAL_i16areg64, 1506 NVPTX::INT_PTX_LDG_GLOBAL_i32areg64, 1507 NVPTX::INT_PTX_LDG_GLOBAL_i64areg64, 1508 NVPTX::INT_PTX_LDG_GLOBAL_f32areg64, 1509 NVPTX::INT_PTX_LDG_GLOBAL_f64areg64); 1510 else 1511 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1512 NVPTX::INT_PTX_LDU_GLOBAL_i8areg64, 1513 NVPTX::INT_PTX_LDU_GLOBAL_i16areg64, 1514 NVPTX::INT_PTX_LDU_GLOBAL_i32areg64, 1515 NVPTX::INT_PTX_LDU_GLOBAL_i64areg64, 1516 NVPTX::INT_PTX_LDU_GLOBAL_f32areg64, 1517 NVPTX::INT_PTX_LDU_GLOBAL_f64areg64); 1518 break; 1519 case NVPTXISD::LoadV2: 1520 case NVPTXISD::LDGV2: 1521 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1522 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64, 1523 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64, 1524 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64, 1525 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64, 1526 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64, 1527 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64); 1528 break; 1529 case NVPTXISD::LDUV2: 1530 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1531 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64, 1532 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64, 1533 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64, 1534 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64, 1535 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64, 1536 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64); 1537 break; 1538 case NVPTXISD::LoadV4: 1539 case NVPTXISD::LDGV4: 1540 Opcode = pickOpcodeForVT( 1541 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64, 1542 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64, 1543 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt, 1544 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt); 1545 break; 1546 case NVPTXISD::LDUV4: 1547 Opcode = pickOpcodeForVT( 1548 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64, 1549 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64, 1550 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt, 1551 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt); 1552 break; 1553 } 1554 } else { 1555 switch (N->getOpcode()) { 1556 default: 1557 return false; 1558 case ISD::LOAD: 1559 case ISD::INTRINSIC_W_CHAIN: 1560 if (IsLDG) 1561 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1562 NVPTX::INT_PTX_LDG_GLOBAL_i8areg, 1563 NVPTX::INT_PTX_LDG_GLOBAL_i16areg, 1564 NVPTX::INT_PTX_LDG_GLOBAL_i32areg, 1565 NVPTX::INT_PTX_LDG_GLOBAL_i64areg, 1566 NVPTX::INT_PTX_LDG_GLOBAL_f32areg, 1567 NVPTX::INT_PTX_LDG_GLOBAL_f64areg); 1568 else 1569 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1570 NVPTX::INT_PTX_LDU_GLOBAL_i8areg, 1571 NVPTX::INT_PTX_LDU_GLOBAL_i16areg, 1572 NVPTX::INT_PTX_LDU_GLOBAL_i32areg, 1573 NVPTX::INT_PTX_LDU_GLOBAL_i64areg, 1574 NVPTX::INT_PTX_LDU_GLOBAL_f32areg, 1575 NVPTX::INT_PTX_LDU_GLOBAL_f64areg); 1576 break; 1577 case NVPTXISD::LoadV2: 1578 case NVPTXISD::LDGV2: 1579 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1580 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32, 1581 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32, 1582 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32, 1583 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32, 1584 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32, 1585 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32); 1586 break; 1587 case NVPTXISD::LDUV2: 1588 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1589 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32, 1590 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32, 1591 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32, 1592 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32, 1593 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32, 1594 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32); 1595 break; 1596 case NVPTXISD::LoadV4: 1597 case NVPTXISD::LDGV4: 1598 Opcode = pickOpcodeForVT( 1599 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32, 1600 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32, 1601 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt, 1602 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt); 1603 break; 1604 case NVPTXISD::LDUV4: 1605 Opcode = pickOpcodeForVT( 1606 EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32, 1607 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32, 1608 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt, 1609 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt); 1610 break; 1611 } 1612 } 1613 if (!Opcode) 1614 return false; 1615 SDValue Ops[] = { Op1, Chain }; 1616 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops); 1617 } 1618 1619 // For automatic generation of LDG (through SelectLoad[Vector], not the 1620 // intrinsics), we may have an extending load like: 1621 // 1622 // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64 1623 // 1624 // In this case, the matching logic above will select a load for the original 1625 // memory type (in this case, i8) and our types will not match (the node needs 1626 // to return an i32 in this case). Our LDG/LDU nodes do not support the 1627 // concept of sign-/zero-extension, so emulate it here by adding an explicit 1628 // CVT instruction. Ptxas should clean up any redundancies here. 1629 1630 LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N); 1631 1632 if (OrigType != EltVT && 1633 (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) { 1634 // We have an extending-load. The instruction we selected operates on the 1635 // smaller type, but the SDNode we are replacing has the larger type. We 1636 // need to emit a CVT to make the types match. 1637 unsigned CvtOpc = 1638 GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode); 1639 1640 // For each output value, apply the manual sign/zero-extension and make sure 1641 // all users of the load go through that CVT. 1642 for (unsigned i = 0; i != NumElts; ++i) { 1643 SDValue Res(LD, i); 1644 SDValue OrigVal(N, i); 1645 1646 SDNode *CvtNode = 1647 CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res, 1648 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, 1649 DL, MVT::i32)); 1650 ReplaceUses(OrigVal, SDValue(CvtNode, 0)); 1651 } 1652 } 1653 1654 ReplaceNode(N, LD); 1655 return true; 1656 } 1657 1658 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { 1659 SDLoc dl(N); 1660 MemSDNode *ST = cast<MemSDNode>(N); 1661 assert(ST->writeMem() && "Expected store"); 1662 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N); 1663 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N); 1664 assert((PlainStore || AtomicStore) && "Expected store"); 1665 EVT StoreVT = ST->getMemoryVT(); 1666 SDNode *NVPTXST = nullptr; 1667 1668 // do not support pre/post inc/dec 1669 if (PlainStore && PlainStore->isIndexed()) 1670 return false; 1671 1672 if (!StoreVT.isSimple()) 1673 return false; 1674 1675 AtomicOrdering Ordering = ST->getSuccessOrdering(); 1676 // In order to lower atomic loads with stronger guarantees we would need to 1677 // use store.release or insert fences. However these features were only added 1678 // with PTX ISA 6.0 / sm_70. 1679 // TODO: Check if we can actually use the new instructions and implement them. 1680 if (isStrongerThanMonotonic(Ordering)) 1681 return false; 1682 1683 // Address Space Setting 1684 unsigned int CodeAddrSpace = getCodeAddrSpace(ST); 1685 unsigned int PointerSize = 1686 CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace()); 1687 1688 // Volatile Setting 1689 // - .volatile is only available for .global and .shared 1690 // - .volatile has the same memory synchronization semantics as .relaxed.sys 1691 bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic; 1692 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && 1693 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && 1694 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) 1695 isVolatile = false; 1696 1697 // Vector Setting 1698 MVT SimpleVT = StoreVT.getSimpleVT(); 1699 unsigned vecType = NVPTX::PTXLdStInstCode::Scalar; 1700 1701 // Type Setting: toType + toTypeWidth 1702 // - for integer type, always use 'u' 1703 // 1704 MVT ScalarVT = SimpleVT.getScalarType(); 1705 unsigned toTypeWidth = ScalarVT.getSizeInBits(); 1706 if (SimpleVT.isVector()) { 1707 assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) && 1708 "Unexpected vector type"); 1709 // v2x16 is stored using st.b32 1710 toTypeWidth = 32; 1711 } 1712 1713 unsigned int toType = getLdStRegType(ScalarVT); 1714 1715 // Create the machine instruction DAG 1716 SDValue Chain = ST->getChain(); 1717 SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal(); 1718 SDValue BasePtr = ST->getBasePtr(); 1719 SDValue Addr; 1720 SDValue Offset, Base; 1721 std::optional<unsigned> Opcode; 1722 MVT::SimpleValueType SourceVT = 1723 Value.getNode()->getSimpleValueType(0).SimpleTy; 1724 1725 if (SelectDirectAddr(BasePtr, Addr)) { 1726 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar, 1727 NVPTX::ST_i32_avar, NVPTX::ST_i64_avar, 1728 NVPTX::ST_f32_avar, NVPTX::ST_f64_avar); 1729 if (!Opcode) 1730 return false; 1731 SDValue Ops[] = {Value, 1732 getI32Imm(isVolatile, dl), 1733 getI32Imm(CodeAddrSpace, dl), 1734 getI32Imm(vecType, dl), 1735 getI32Imm(toType, dl), 1736 getI32Imm(toTypeWidth, dl), 1737 Addr, 1738 Chain}; 1739 NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); 1740 } else if (PointerSize == 64 1741 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset) 1742 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) { 1743 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi, 1744 NVPTX::ST_i32_asi, NVPTX::ST_i64_asi, 1745 NVPTX::ST_f32_asi, NVPTX::ST_f64_asi); 1746 if (!Opcode) 1747 return false; 1748 SDValue Ops[] = {Value, 1749 getI32Imm(isVolatile, dl), 1750 getI32Imm(CodeAddrSpace, dl), 1751 getI32Imm(vecType, dl), 1752 getI32Imm(toType, dl), 1753 getI32Imm(toTypeWidth, dl), 1754 Base, 1755 Offset, 1756 Chain}; 1757 NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); 1758 } else if (PointerSize == 64 1759 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset) 1760 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) { 1761 if (PointerSize == 64) 1762 Opcode = 1763 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64, 1764 NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, 1765 NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64); 1766 else 1767 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari, 1768 NVPTX::ST_i32_ari, NVPTX::ST_i64_ari, 1769 NVPTX::ST_f32_ari, NVPTX::ST_f64_ari); 1770 if (!Opcode) 1771 return false; 1772 1773 SDValue Ops[] = {Value, 1774 getI32Imm(isVolatile, dl), 1775 getI32Imm(CodeAddrSpace, dl), 1776 getI32Imm(vecType, dl), 1777 getI32Imm(toType, dl), 1778 getI32Imm(toTypeWidth, dl), 1779 Base, 1780 Offset, 1781 Chain}; 1782 NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); 1783 } else { 1784 if (PointerSize == 64) 1785 Opcode = 1786 pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64, 1787 NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64, 1788 NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64); 1789 else 1790 Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg, 1791 NVPTX::ST_i32_areg, NVPTX::ST_i64_areg, 1792 NVPTX::ST_f32_areg, NVPTX::ST_f64_areg); 1793 if (!Opcode) 1794 return false; 1795 SDValue Ops[] = {Value, 1796 getI32Imm(isVolatile, dl), 1797 getI32Imm(CodeAddrSpace, dl), 1798 getI32Imm(vecType, dl), 1799 getI32Imm(toType, dl), 1800 getI32Imm(toTypeWidth, dl), 1801 BasePtr, 1802 Chain}; 1803 NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops); 1804 } 1805 1806 if (!NVPTXST) 1807 return false; 1808 1809 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); 1810 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef}); 1811 ReplaceNode(N, NVPTXST); 1812 return true; 1813 } 1814 1815 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { 1816 SDValue Chain = N->getOperand(0); 1817 SDValue Op1 = N->getOperand(1); 1818 SDValue Addr, Offset, Base; 1819 std::optional<unsigned> Opcode; 1820 SDLoc DL(N); 1821 SDNode *ST; 1822 EVT EltVT = Op1.getValueType(); 1823 MemSDNode *MemSD = cast<MemSDNode>(N); 1824 EVT StoreVT = MemSD->getMemoryVT(); 1825 1826 // Address Space Setting 1827 unsigned CodeAddrSpace = getCodeAddrSpace(MemSD); 1828 if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) { 1829 report_fatal_error("Cannot store to pointer that points to constant " 1830 "memory space"); 1831 } 1832 unsigned int PointerSize = 1833 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace()); 1834 1835 // Volatile Setting 1836 // - .volatile is only availalble for .global and .shared 1837 bool IsVolatile = MemSD->isVolatile(); 1838 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL && 1839 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED && 1840 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC) 1841 IsVolatile = false; 1842 1843 // Type Setting: toType + toTypeWidth 1844 // - for integer type, always use 'u' 1845 assert(StoreVT.isSimple() && "Store value is not simple"); 1846 MVT ScalarVT = StoreVT.getSimpleVT().getScalarType(); 1847 unsigned ToTypeWidth = ScalarVT.getSizeInBits(); 1848 unsigned ToType = getLdStRegType(ScalarVT); 1849 1850 SmallVector<SDValue, 12> StOps; 1851 SDValue N2; 1852 unsigned VecType; 1853 1854 switch (N->getOpcode()) { 1855 case NVPTXISD::StoreV2: 1856 VecType = NVPTX::PTXLdStInstCode::V2; 1857 StOps.push_back(N->getOperand(1)); 1858 StOps.push_back(N->getOperand(2)); 1859 N2 = N->getOperand(3); 1860 break; 1861 case NVPTXISD::StoreV4: 1862 VecType = NVPTX::PTXLdStInstCode::V4; 1863 StOps.push_back(N->getOperand(1)); 1864 StOps.push_back(N->getOperand(2)); 1865 StOps.push_back(N->getOperand(3)); 1866 StOps.push_back(N->getOperand(4)); 1867 N2 = N->getOperand(5); 1868 break; 1869 default: 1870 return false; 1871 } 1872 1873 // v8x16 is a special case. PTX doesn't have st.v8.x16 1874 // instruction. Instead, we split the vector into v2x16 chunks and 1875 // store them with st.v4.b32. 1876 if (Isv2x16VT(EltVT)) { 1877 assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode."); 1878 EltVT = MVT::i32; 1879 ToType = NVPTX::PTXLdStInstCode::Untyped; 1880 ToTypeWidth = 32; 1881 } 1882 1883 StOps.push_back(getI32Imm(IsVolatile, DL)); 1884 StOps.push_back(getI32Imm(CodeAddrSpace, DL)); 1885 StOps.push_back(getI32Imm(VecType, DL)); 1886 StOps.push_back(getI32Imm(ToType, DL)); 1887 StOps.push_back(getI32Imm(ToTypeWidth, DL)); 1888 1889 if (SelectDirectAddr(N2, Addr)) { 1890 switch (N->getOpcode()) { 1891 default: 1892 return false; 1893 case NVPTXISD::StoreV2: 1894 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1895 NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar, 1896 NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar, 1897 NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar); 1898 break; 1899 case NVPTXISD::StoreV4: 1900 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1901 NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar, 1902 NVPTX::STV_i32_v4_avar, std::nullopt, 1903 NVPTX::STV_f32_v4_avar, std::nullopt); 1904 break; 1905 } 1906 StOps.push_back(Addr); 1907 } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset) 1908 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) { 1909 switch (N->getOpcode()) { 1910 default: 1911 return false; 1912 case NVPTXISD::StoreV2: 1913 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1914 NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi, 1915 NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi, 1916 NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi); 1917 break; 1918 case NVPTXISD::StoreV4: 1919 Opcode = 1920 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi, 1921 NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, 1922 std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt); 1923 break; 1924 } 1925 StOps.push_back(Base); 1926 StOps.push_back(Offset); 1927 } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset) 1928 : SelectADDRri(N2.getNode(), N2, Base, Offset)) { 1929 if (PointerSize == 64) { 1930 switch (N->getOpcode()) { 1931 default: 1932 return false; 1933 case NVPTXISD::StoreV2: 1934 Opcode = 1935 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1936 NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64, 1937 NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64, 1938 NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64); 1939 break; 1940 case NVPTXISD::StoreV4: 1941 Opcode = pickOpcodeForVT( 1942 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64, 1943 NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt, 1944 NVPTX::STV_f32_v4_ari_64, std::nullopt); 1945 break; 1946 } 1947 } else { 1948 switch (N->getOpcode()) { 1949 default: 1950 return false; 1951 case NVPTXISD::StoreV2: 1952 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1953 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari, 1954 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari, 1955 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari); 1956 break; 1957 case NVPTXISD::StoreV4: 1958 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, 1959 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari, 1960 NVPTX::STV_i32_v4_ari, std::nullopt, 1961 NVPTX::STV_f32_v4_ari, std::nullopt); 1962 break; 1963 } 1964 } 1965 StOps.push_back(Base); 1966 StOps.push_back(Offset); 1967 } else { 1968 if (PointerSize == 64) { 1969 switch (N->getOpcode()) { 1970 default: 1971 return false; 1972 case NVPTXISD::StoreV2: 1973 Opcode = pickOpcodeForVT( 1974 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64, 1975 NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64, 1976 NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64, 1977 NVPTX::STV_f64_v2_areg_64); 1978 break; 1979 case NVPTXISD::StoreV4: 1980 Opcode = pickOpcodeForVT( 1981 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64, 1982 NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt, 1983 NVPTX::STV_f32_v4_areg_64, std::nullopt); 1984 break; 1985 } 1986 } else { 1987 switch (N->getOpcode()) { 1988 default: 1989 return false; 1990 case NVPTXISD::StoreV2: 1991 Opcode = 1992 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg, 1993 NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg, 1994 NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg, 1995 NVPTX::STV_f64_v2_areg); 1996 break; 1997 case NVPTXISD::StoreV4: 1998 Opcode = 1999 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg, 2000 NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, 2001 std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt); 2002 break; 2003 } 2004 } 2005 StOps.push_back(N2); 2006 } 2007 2008 if (!Opcode) 2009 return false; 2010 2011 StOps.push_back(Chain); 2012 2013 ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps); 2014 2015 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); 2016 CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef}); 2017 2018 ReplaceNode(N, ST); 2019 return true; 2020 } 2021 2022 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { 2023 SDValue Chain = Node->getOperand(0); 2024 SDValue Offset = Node->getOperand(2); 2025 SDValue Glue = Node->getOperand(3); 2026 SDLoc DL(Node); 2027 MemSDNode *Mem = cast<MemSDNode>(Node); 2028 2029 unsigned VecSize; 2030 switch (Node->getOpcode()) { 2031 default: 2032 return false; 2033 case NVPTXISD::LoadParam: 2034 VecSize = 1; 2035 break; 2036 case NVPTXISD::LoadParamV2: 2037 VecSize = 2; 2038 break; 2039 case NVPTXISD::LoadParamV4: 2040 VecSize = 4; 2041 break; 2042 } 2043 2044 EVT EltVT = Node->getValueType(0); 2045 EVT MemVT = Mem->getMemoryVT(); 2046 2047 std::optional<unsigned> Opcode; 2048 2049 switch (VecSize) { 2050 default: 2051 return false; 2052 case 1: 2053 Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, 2054 NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16, 2055 NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64, 2056 NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64); 2057 break; 2058 case 2: 2059 Opcode = 2060 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8, 2061 NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32, 2062 NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32, 2063 NVPTX::LoadParamMemV2F64); 2064 break; 2065 case 4: 2066 Opcode = 2067 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8, 2068 NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, 2069 std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt); 2070 break; 2071 } 2072 if (!Opcode) 2073 return false; 2074 2075 SDVTList VTs; 2076 if (VecSize == 1) { 2077 VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue); 2078 } else if (VecSize == 2) { 2079 VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); 2080 } else { 2081 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; 2082 VTs = CurDAG->getVTList(EVTs); 2083 } 2084 2085 unsigned OffsetVal = Offset->getAsZExtVal(); 2086 2087 SmallVector<SDValue, 2> Ops; 2088 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); 2089 Ops.push_back(Chain); 2090 Ops.push_back(Glue); 2091 2092 ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops)); 2093 return true; 2094 } 2095 2096 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { 2097 SDLoc DL(N); 2098 SDValue Chain = N->getOperand(0); 2099 SDValue Offset = N->getOperand(1); 2100 unsigned OffsetVal = Offset->getAsZExtVal(); 2101 MemSDNode *Mem = cast<MemSDNode>(N); 2102 2103 // How many elements do we have? 2104 unsigned NumElts = 1; 2105 switch (N->getOpcode()) { 2106 default: 2107 return false; 2108 case NVPTXISD::StoreRetval: 2109 NumElts = 1; 2110 break; 2111 case NVPTXISD::StoreRetvalV2: 2112 NumElts = 2; 2113 break; 2114 case NVPTXISD::StoreRetvalV4: 2115 NumElts = 4; 2116 break; 2117 } 2118 2119 // Build vector of operands 2120 SmallVector<SDValue, 6> Ops; 2121 for (unsigned i = 0; i < NumElts; ++i) 2122 Ops.push_back(N->getOperand(i + 2)); 2123 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); 2124 Ops.push_back(Chain); 2125 2126 // Determine target opcode 2127 // If we have an i1, use an 8-bit store. The lowering code in 2128 // NVPTXISelLowering will have already emitted an upcast. 2129 std::optional<unsigned> Opcode = 0; 2130 switch (NumElts) { 2131 default: 2132 return false; 2133 case 1: 2134 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, 2135 NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16, 2136 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64, 2137 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64); 2138 break; 2139 case 2: 2140 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, 2141 NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16, 2142 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64, 2143 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64); 2144 break; 2145 case 4: 2146 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, 2147 NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16, 2148 NVPTX::StoreRetvalV4I32, std::nullopt, 2149 NVPTX::StoreRetvalV4F32, std::nullopt); 2150 break; 2151 } 2152 if (!Opcode) 2153 return false; 2154 2155 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops); 2156 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); 2157 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef}); 2158 2159 ReplaceNode(N, Ret); 2160 return true; 2161 } 2162 2163 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { 2164 SDLoc DL(N); 2165 SDValue Chain = N->getOperand(0); 2166 SDValue Param = N->getOperand(1); 2167 unsigned ParamVal = Param->getAsZExtVal(); 2168 SDValue Offset = N->getOperand(2); 2169 unsigned OffsetVal = Offset->getAsZExtVal(); 2170 MemSDNode *Mem = cast<MemSDNode>(N); 2171 SDValue Glue = N->getOperand(N->getNumOperands() - 1); 2172 2173 // How many elements do we have? 2174 unsigned NumElts = 1; 2175 switch (N->getOpcode()) { 2176 default: 2177 return false; 2178 case NVPTXISD::StoreParamU32: 2179 case NVPTXISD::StoreParamS32: 2180 case NVPTXISD::StoreParam: 2181 NumElts = 1; 2182 break; 2183 case NVPTXISD::StoreParamV2: 2184 NumElts = 2; 2185 break; 2186 case NVPTXISD::StoreParamV4: 2187 NumElts = 4; 2188 break; 2189 } 2190 2191 // Build vector of operands 2192 SmallVector<SDValue, 8> Ops; 2193 for (unsigned i = 0; i < NumElts; ++i) 2194 Ops.push_back(N->getOperand(i + 3)); 2195 Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32)); 2196 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); 2197 Ops.push_back(Chain); 2198 Ops.push_back(Glue); 2199 2200 // Determine target opcode 2201 // If we have an i1, use an 8-bit store. The lowering code in 2202 // NVPTXISelLowering will have already emitted an upcast. 2203 std::optional<unsigned> Opcode = 0; 2204 switch (N->getOpcode()) { 2205 default: 2206 switch (NumElts) { 2207 default: 2208 return false; 2209 case 1: 2210 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, 2211 NVPTX::StoreParamI8, NVPTX::StoreParamI16, 2212 NVPTX::StoreParamI32, NVPTX::StoreParamI64, 2213 NVPTX::StoreParamF32, NVPTX::StoreParamF64); 2214 break; 2215 case 2: 2216 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, 2217 NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16, 2218 NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64, 2219 NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64); 2220 break; 2221 case 4: 2222 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, 2223 NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16, 2224 NVPTX::StoreParamV4I32, std::nullopt, 2225 NVPTX::StoreParamV4F32, std::nullopt); 2226 break; 2227 } 2228 if (!Opcode) 2229 return false; 2230 break; 2231 // Special case: if we have a sign-extend/zero-extend node, insert the 2232 // conversion instruction first, and use that as the value operand to 2233 // the selected StoreParam node. 2234 case NVPTXISD::StoreParamU32: { 2235 Opcode = NVPTX::StoreParamI32; 2236 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, 2237 MVT::i32); 2238 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL, 2239 MVT::i32, Ops[0], CvtNone); 2240 Ops[0] = SDValue(Cvt, 0); 2241 break; 2242 } 2243 case NVPTXISD::StoreParamS32: { 2244 Opcode = NVPTX::StoreParamI32; 2245 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, 2246 MVT::i32); 2247 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL, 2248 MVT::i32, Ops[0], CvtNone); 2249 Ops[0] = SDValue(Cvt, 0); 2250 break; 2251 } 2252 } 2253 2254 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); 2255 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops); 2256 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); 2257 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef}); 2258 2259 ReplaceNode(N, Ret); 2260 return true; 2261 } 2262 2263 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) { 2264 unsigned Opc = 0; 2265 2266 switch (N->getOpcode()) { 2267 default: return false; 2268 case NVPTXISD::Tex1DFloatS32: 2269 Opc = NVPTX::TEX_1D_F32_S32_RR; 2270 break; 2271 case NVPTXISD::Tex1DFloatFloat: 2272 Opc = NVPTX::TEX_1D_F32_F32_RR; 2273 break; 2274 case NVPTXISD::Tex1DFloatFloatLevel: 2275 Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR; 2276 break; 2277 case NVPTXISD::Tex1DFloatFloatGrad: 2278 Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR; 2279 break; 2280 case NVPTXISD::Tex1DS32S32: 2281 Opc = NVPTX::TEX_1D_S32_S32_RR; 2282 break; 2283 case NVPTXISD::Tex1DS32Float: 2284 Opc = NVPTX::TEX_1D_S32_F32_RR; 2285 break; 2286 case NVPTXISD::Tex1DS32FloatLevel: 2287 Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR; 2288 break; 2289 case NVPTXISD::Tex1DS32FloatGrad: 2290 Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR; 2291 break; 2292 case NVPTXISD::Tex1DU32S32: 2293 Opc = NVPTX::TEX_1D_U32_S32_RR; 2294 break; 2295 case NVPTXISD::Tex1DU32Float: 2296 Opc = NVPTX::TEX_1D_U32_F32_RR; 2297 break; 2298 case NVPTXISD::Tex1DU32FloatLevel: 2299 Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR; 2300 break; 2301 case NVPTXISD::Tex1DU32FloatGrad: 2302 Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR; 2303 break; 2304 case NVPTXISD::Tex1DArrayFloatS32: 2305 Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR; 2306 break; 2307 case NVPTXISD::Tex1DArrayFloatFloat: 2308 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR; 2309 break; 2310 case NVPTXISD::Tex1DArrayFloatFloatLevel: 2311 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR; 2312 break; 2313 case NVPTXISD::Tex1DArrayFloatFloatGrad: 2314 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR; 2315 break; 2316 case NVPTXISD::Tex1DArrayS32S32: 2317 Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR; 2318 break; 2319 case NVPTXISD::Tex1DArrayS32Float: 2320 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR; 2321 break; 2322 case NVPTXISD::Tex1DArrayS32FloatLevel: 2323 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR; 2324 break; 2325 case NVPTXISD::Tex1DArrayS32FloatGrad: 2326 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR; 2327 break; 2328 case NVPTXISD::Tex1DArrayU32S32: 2329 Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR; 2330 break; 2331 case NVPTXISD::Tex1DArrayU32Float: 2332 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR; 2333 break; 2334 case NVPTXISD::Tex1DArrayU32FloatLevel: 2335 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR; 2336 break; 2337 case NVPTXISD::Tex1DArrayU32FloatGrad: 2338 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR; 2339 break; 2340 case NVPTXISD::Tex2DFloatS32: 2341 Opc = NVPTX::TEX_2D_F32_S32_RR; 2342 break; 2343 case NVPTXISD::Tex2DFloatFloat: 2344 Opc = NVPTX::TEX_2D_F32_F32_RR; 2345 break; 2346 case NVPTXISD::Tex2DFloatFloatLevel: 2347 Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR; 2348 break; 2349 case NVPTXISD::Tex2DFloatFloatGrad: 2350 Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR; 2351 break; 2352 case NVPTXISD::Tex2DS32S32: 2353 Opc = NVPTX::TEX_2D_S32_S32_RR; 2354 break; 2355 case NVPTXISD::Tex2DS32Float: 2356 Opc = NVPTX::TEX_2D_S32_F32_RR; 2357 break; 2358 case NVPTXISD::Tex2DS32FloatLevel: 2359 Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR; 2360 break; 2361 case NVPTXISD::Tex2DS32FloatGrad: 2362 Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR; 2363 break; 2364 case NVPTXISD::Tex2DU32S32: 2365 Opc = NVPTX::TEX_2D_U32_S32_RR; 2366 break; 2367 case NVPTXISD::Tex2DU32Float: 2368 Opc = NVPTX::TEX_2D_U32_F32_RR; 2369 break; 2370 case NVPTXISD::Tex2DU32FloatLevel: 2371 Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR; 2372 break; 2373 case NVPTXISD::Tex2DU32FloatGrad: 2374 Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR; 2375 break; 2376 case NVPTXISD::Tex2DArrayFloatS32: 2377 Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR; 2378 break; 2379 case NVPTXISD::Tex2DArrayFloatFloat: 2380 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR; 2381 break; 2382 case NVPTXISD::Tex2DArrayFloatFloatLevel: 2383 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR; 2384 break; 2385 case NVPTXISD::Tex2DArrayFloatFloatGrad: 2386 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR; 2387 break; 2388 case NVPTXISD::Tex2DArrayS32S32: 2389 Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR; 2390 break; 2391 case NVPTXISD::Tex2DArrayS32Float: 2392 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR; 2393 break; 2394 case NVPTXISD::Tex2DArrayS32FloatLevel: 2395 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR; 2396 break; 2397 case NVPTXISD::Tex2DArrayS32FloatGrad: 2398 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR; 2399 break; 2400 case NVPTXISD::Tex2DArrayU32S32: 2401 Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR; 2402 break; 2403 case NVPTXISD::Tex2DArrayU32Float: 2404 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR; 2405 break; 2406 case NVPTXISD::Tex2DArrayU32FloatLevel: 2407 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR; 2408 break; 2409 case NVPTXISD::Tex2DArrayU32FloatGrad: 2410 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR; 2411 break; 2412 case NVPTXISD::Tex3DFloatS32: 2413 Opc = NVPTX::TEX_3D_F32_S32_RR; 2414 break; 2415 case NVPTXISD::Tex3DFloatFloat: 2416 Opc = NVPTX::TEX_3D_F32_F32_RR; 2417 break; 2418 case NVPTXISD::Tex3DFloatFloatLevel: 2419 Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR; 2420 break; 2421 case NVPTXISD::Tex3DFloatFloatGrad: 2422 Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR; 2423 break; 2424 case NVPTXISD::Tex3DS32S32: 2425 Opc = NVPTX::TEX_3D_S32_S32_RR; 2426 break; 2427 case NVPTXISD::Tex3DS32Float: 2428 Opc = NVPTX::TEX_3D_S32_F32_RR; 2429 break; 2430 case NVPTXISD::Tex3DS32FloatLevel: 2431 Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR; 2432 break; 2433 case NVPTXISD::Tex3DS32FloatGrad: 2434 Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR; 2435 break; 2436 case NVPTXISD::Tex3DU32S32: 2437 Opc = NVPTX::TEX_3D_U32_S32_RR; 2438 break; 2439 case NVPTXISD::Tex3DU32Float: 2440 Opc = NVPTX::TEX_3D_U32_F32_RR; 2441 break; 2442 case NVPTXISD::Tex3DU32FloatLevel: 2443 Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR; 2444 break; 2445 case NVPTXISD::Tex3DU32FloatGrad: 2446 Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR; 2447 break; 2448 case NVPTXISD::TexCubeFloatFloat: 2449 Opc = NVPTX::TEX_CUBE_F32_F32_RR; 2450 break; 2451 case NVPTXISD::TexCubeFloatFloatLevel: 2452 Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR; 2453 break; 2454 case NVPTXISD::TexCubeS32Float: 2455 Opc = NVPTX::TEX_CUBE_S32_F32_RR; 2456 break; 2457 case NVPTXISD::TexCubeS32FloatLevel: 2458 Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR; 2459 break; 2460 case NVPTXISD::TexCubeU32Float: 2461 Opc = NVPTX::TEX_CUBE_U32_F32_RR; 2462 break; 2463 case NVPTXISD::TexCubeU32FloatLevel: 2464 Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR; 2465 break; 2466 case NVPTXISD::TexCubeArrayFloatFloat: 2467 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR; 2468 break; 2469 case NVPTXISD::TexCubeArrayFloatFloatLevel: 2470 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR; 2471 break; 2472 case NVPTXISD::TexCubeArrayS32Float: 2473 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR; 2474 break; 2475 case NVPTXISD::TexCubeArrayS32FloatLevel: 2476 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR; 2477 break; 2478 case NVPTXISD::TexCubeArrayU32Float: 2479 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR; 2480 break; 2481 case NVPTXISD::TexCubeArrayU32FloatLevel: 2482 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR; 2483 break; 2484 case NVPTXISD::Tld4R2DFloatFloat: 2485 Opc = NVPTX::TLD4_R_2D_F32_F32_RR; 2486 break; 2487 case NVPTXISD::Tld4G2DFloatFloat: 2488 Opc = NVPTX::TLD4_G_2D_F32_F32_RR; 2489 break; 2490 case NVPTXISD::Tld4B2DFloatFloat: 2491 Opc = NVPTX::TLD4_B_2D_F32_F32_RR; 2492 break; 2493 case NVPTXISD::Tld4A2DFloatFloat: 2494 Opc = NVPTX::TLD4_A_2D_F32_F32_RR; 2495 break; 2496 case NVPTXISD::Tld4R2DS64Float: 2497 Opc = NVPTX::TLD4_R_2D_S32_F32_RR; 2498 break; 2499 case NVPTXISD::Tld4G2DS64Float: 2500 Opc = NVPTX::TLD4_G_2D_S32_F32_RR; 2501 break; 2502 case NVPTXISD::Tld4B2DS64Float: 2503 Opc = NVPTX::TLD4_B_2D_S32_F32_RR; 2504 break; 2505 case NVPTXISD::Tld4A2DS64Float: 2506 Opc = NVPTX::TLD4_A_2D_S32_F32_RR; 2507 break; 2508 case NVPTXISD::Tld4R2DU64Float: 2509 Opc = NVPTX::TLD4_R_2D_U32_F32_RR; 2510 break; 2511 case NVPTXISD::Tld4G2DU64Float: 2512 Opc = NVPTX::TLD4_G_2D_U32_F32_RR; 2513 break; 2514 case NVPTXISD::Tld4B2DU64Float: 2515 Opc = NVPTX::TLD4_B_2D_U32_F32_RR; 2516 break; 2517 case NVPTXISD::Tld4A2DU64Float: 2518 Opc = NVPTX::TLD4_A_2D_U32_F32_RR; 2519 break; 2520 case NVPTXISD::TexUnified1DFloatS32: 2521 Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R; 2522 break; 2523 case NVPTXISD::TexUnified1DFloatFloat: 2524 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R; 2525 break; 2526 case NVPTXISD::TexUnified1DFloatFloatLevel: 2527 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R; 2528 break; 2529 case NVPTXISD::TexUnified1DFloatFloatGrad: 2530 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R; 2531 break; 2532 case NVPTXISD::TexUnified1DS32S32: 2533 Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R; 2534 break; 2535 case NVPTXISD::TexUnified1DS32Float: 2536 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R; 2537 break; 2538 case NVPTXISD::TexUnified1DS32FloatLevel: 2539 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R; 2540 break; 2541 case NVPTXISD::TexUnified1DS32FloatGrad: 2542 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R; 2543 break; 2544 case NVPTXISD::TexUnified1DU32S32: 2545 Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R; 2546 break; 2547 case NVPTXISD::TexUnified1DU32Float: 2548 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R; 2549 break; 2550 case NVPTXISD::TexUnified1DU32FloatLevel: 2551 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R; 2552 break; 2553 case NVPTXISD::TexUnified1DU32FloatGrad: 2554 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R; 2555 break; 2556 case NVPTXISD::TexUnified1DArrayFloatS32: 2557 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R; 2558 break; 2559 case NVPTXISD::TexUnified1DArrayFloatFloat: 2560 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R; 2561 break; 2562 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 2563 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R; 2564 break; 2565 case NVPTXISD::TexUnified1DArrayFloatFloatGrad: 2566 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R; 2567 break; 2568 case NVPTXISD::TexUnified1DArrayS32S32: 2569 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R; 2570 break; 2571 case NVPTXISD::TexUnified1DArrayS32Float: 2572 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R; 2573 break; 2574 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 2575 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R; 2576 break; 2577 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 2578 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R; 2579 break; 2580 case NVPTXISD::TexUnified1DArrayU32S32: 2581 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R; 2582 break; 2583 case NVPTXISD::TexUnified1DArrayU32Float: 2584 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R; 2585 break; 2586 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 2587 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R; 2588 break; 2589 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 2590 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R; 2591 break; 2592 case NVPTXISD::TexUnified2DFloatS32: 2593 Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R; 2594 break; 2595 case NVPTXISD::TexUnified2DFloatFloat: 2596 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R; 2597 break; 2598 case NVPTXISD::TexUnified2DFloatFloatLevel: 2599 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R; 2600 break; 2601 case NVPTXISD::TexUnified2DFloatFloatGrad: 2602 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R; 2603 break; 2604 case NVPTXISD::TexUnified2DS32S32: 2605 Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R; 2606 break; 2607 case NVPTXISD::TexUnified2DS32Float: 2608 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R; 2609 break; 2610 case NVPTXISD::TexUnified2DS32FloatLevel: 2611 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R; 2612 break; 2613 case NVPTXISD::TexUnified2DS32FloatGrad: 2614 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R; 2615 break; 2616 case NVPTXISD::TexUnified2DU32S32: 2617 Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R; 2618 break; 2619 case NVPTXISD::TexUnified2DU32Float: 2620 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R; 2621 break; 2622 case NVPTXISD::TexUnified2DU32FloatLevel: 2623 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R; 2624 break; 2625 case NVPTXISD::TexUnified2DU32FloatGrad: 2626 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R; 2627 break; 2628 case NVPTXISD::TexUnified2DArrayFloatS32: 2629 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R; 2630 break; 2631 case NVPTXISD::TexUnified2DArrayFloatFloat: 2632 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R; 2633 break; 2634 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 2635 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R; 2636 break; 2637 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 2638 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R; 2639 break; 2640 case NVPTXISD::TexUnified2DArrayS32S32: 2641 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R; 2642 break; 2643 case NVPTXISD::TexUnified2DArrayS32Float: 2644 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R; 2645 break; 2646 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 2647 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R; 2648 break; 2649 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 2650 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R; 2651 break; 2652 case NVPTXISD::TexUnified2DArrayU32S32: 2653 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R; 2654 break; 2655 case NVPTXISD::TexUnified2DArrayU32Float: 2656 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R; 2657 break; 2658 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 2659 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R; 2660 break; 2661 case NVPTXISD::TexUnified2DArrayU32FloatGrad: 2662 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R; 2663 break; 2664 case NVPTXISD::TexUnified3DFloatS32: 2665 Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R; 2666 break; 2667 case NVPTXISD::TexUnified3DFloatFloat: 2668 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R; 2669 break; 2670 case NVPTXISD::TexUnified3DFloatFloatLevel: 2671 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R; 2672 break; 2673 case NVPTXISD::TexUnified3DFloatFloatGrad: 2674 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R; 2675 break; 2676 case NVPTXISD::TexUnified3DS32S32: 2677 Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R; 2678 break; 2679 case NVPTXISD::TexUnified3DS32Float: 2680 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R; 2681 break; 2682 case NVPTXISD::TexUnified3DS32FloatLevel: 2683 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R; 2684 break; 2685 case NVPTXISD::TexUnified3DS32FloatGrad: 2686 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R; 2687 break; 2688 case NVPTXISD::TexUnified3DU32S32: 2689 Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R; 2690 break; 2691 case NVPTXISD::TexUnified3DU32Float: 2692 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R; 2693 break; 2694 case NVPTXISD::TexUnified3DU32FloatLevel: 2695 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R; 2696 break; 2697 case NVPTXISD::TexUnified3DU32FloatGrad: 2698 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R; 2699 break; 2700 case NVPTXISD::TexUnifiedCubeFloatFloat: 2701 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R; 2702 break; 2703 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 2704 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R; 2705 break; 2706 case NVPTXISD::TexUnifiedCubeS32Float: 2707 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R; 2708 break; 2709 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 2710 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R; 2711 break; 2712 case NVPTXISD::TexUnifiedCubeU32Float: 2713 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R; 2714 break; 2715 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 2716 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R; 2717 break; 2718 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 2719 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R; 2720 break; 2721 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 2722 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R; 2723 break; 2724 case NVPTXISD::TexUnifiedCubeArrayS32Float: 2725 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R; 2726 break; 2727 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 2728 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R; 2729 break; 2730 case NVPTXISD::TexUnifiedCubeArrayU32Float: 2731 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R; 2732 break; 2733 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 2734 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R; 2735 break; 2736 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 2737 Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R; 2738 break; 2739 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 2740 Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R; 2741 break; 2742 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 2743 Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R; 2744 break; 2745 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 2746 Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R; 2747 break; 2748 case NVPTXISD::Tld4UnifiedR2DS64Float: 2749 Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R; 2750 break; 2751 case NVPTXISD::Tld4UnifiedG2DS64Float: 2752 Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R; 2753 break; 2754 case NVPTXISD::Tld4UnifiedB2DS64Float: 2755 Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R; 2756 break; 2757 case NVPTXISD::Tld4UnifiedA2DS64Float: 2758 Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R; 2759 break; 2760 case NVPTXISD::Tld4UnifiedR2DU64Float: 2761 Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R; 2762 break; 2763 case NVPTXISD::Tld4UnifiedG2DU64Float: 2764 Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R; 2765 break; 2766 case NVPTXISD::Tld4UnifiedB2DU64Float: 2767 Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R; 2768 break; 2769 case NVPTXISD::Tld4UnifiedA2DU64Float: 2770 Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R; 2771 break; 2772 case NVPTXISD::TexUnifiedCubeFloatFloatGrad: 2773 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R; 2774 break; 2775 case NVPTXISD::TexUnifiedCubeS32FloatGrad: 2776 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R; 2777 break; 2778 case NVPTXISD::TexUnifiedCubeU32FloatGrad: 2779 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R; 2780 break; 2781 case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: 2782 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R; 2783 break; 2784 case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: 2785 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R; 2786 break; 2787 case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: 2788 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R; 2789 break; 2790 } 2791 2792 // Copy over operands 2793 SmallVector<SDValue, 8> Ops(drop_begin(N->ops())); 2794 Ops.push_back(N->getOperand(0)); // Move chain to the back. 2795 2796 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops)); 2797 return true; 2798 } 2799 2800 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) { 2801 unsigned Opc = 0; 2802 switch (N->getOpcode()) { 2803 default: return false; 2804 case NVPTXISD::Suld1DI8Clamp: 2805 Opc = NVPTX::SULD_1D_I8_CLAMP_R; 2806 break; 2807 case NVPTXISD::Suld1DI16Clamp: 2808 Opc = NVPTX::SULD_1D_I16_CLAMP_R; 2809 break; 2810 case NVPTXISD::Suld1DI32Clamp: 2811 Opc = NVPTX::SULD_1D_I32_CLAMP_R; 2812 break; 2813 case NVPTXISD::Suld1DI64Clamp: 2814 Opc = NVPTX::SULD_1D_I64_CLAMP_R; 2815 break; 2816 case NVPTXISD::Suld1DV2I8Clamp: 2817 Opc = NVPTX::SULD_1D_V2I8_CLAMP_R; 2818 break; 2819 case NVPTXISD::Suld1DV2I16Clamp: 2820 Opc = NVPTX::SULD_1D_V2I16_CLAMP_R; 2821 break; 2822 case NVPTXISD::Suld1DV2I32Clamp: 2823 Opc = NVPTX::SULD_1D_V2I32_CLAMP_R; 2824 break; 2825 case NVPTXISD::Suld1DV2I64Clamp: 2826 Opc = NVPTX::SULD_1D_V2I64_CLAMP_R; 2827 break; 2828 case NVPTXISD::Suld1DV4I8Clamp: 2829 Opc = NVPTX::SULD_1D_V4I8_CLAMP_R; 2830 break; 2831 case NVPTXISD::Suld1DV4I16Clamp: 2832 Opc = NVPTX::SULD_1D_V4I16_CLAMP_R; 2833 break; 2834 case NVPTXISD::Suld1DV4I32Clamp: 2835 Opc = NVPTX::SULD_1D_V4I32_CLAMP_R; 2836 break; 2837 case NVPTXISD::Suld1DArrayI8Clamp: 2838 Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R; 2839 break; 2840 case NVPTXISD::Suld1DArrayI16Clamp: 2841 Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R; 2842 break; 2843 case NVPTXISD::Suld1DArrayI32Clamp: 2844 Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R; 2845 break; 2846 case NVPTXISD::Suld1DArrayI64Clamp: 2847 Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R; 2848 break; 2849 case NVPTXISD::Suld1DArrayV2I8Clamp: 2850 Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R; 2851 break; 2852 case NVPTXISD::Suld1DArrayV2I16Clamp: 2853 Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R; 2854 break; 2855 case NVPTXISD::Suld1DArrayV2I32Clamp: 2856 Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R; 2857 break; 2858 case NVPTXISD::Suld1DArrayV2I64Clamp: 2859 Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R; 2860 break; 2861 case NVPTXISD::Suld1DArrayV4I8Clamp: 2862 Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R; 2863 break; 2864 case NVPTXISD::Suld1DArrayV4I16Clamp: 2865 Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R; 2866 break; 2867 case NVPTXISD::Suld1DArrayV4I32Clamp: 2868 Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R; 2869 break; 2870 case NVPTXISD::Suld2DI8Clamp: 2871 Opc = NVPTX::SULD_2D_I8_CLAMP_R; 2872 break; 2873 case NVPTXISD::Suld2DI16Clamp: 2874 Opc = NVPTX::SULD_2D_I16_CLAMP_R; 2875 break; 2876 case NVPTXISD::Suld2DI32Clamp: 2877 Opc = NVPTX::SULD_2D_I32_CLAMP_R; 2878 break; 2879 case NVPTXISD::Suld2DI64Clamp: 2880 Opc = NVPTX::SULD_2D_I64_CLAMP_R; 2881 break; 2882 case NVPTXISD::Suld2DV2I8Clamp: 2883 Opc = NVPTX::SULD_2D_V2I8_CLAMP_R; 2884 break; 2885 case NVPTXISD::Suld2DV2I16Clamp: 2886 Opc = NVPTX::SULD_2D_V2I16_CLAMP_R; 2887 break; 2888 case NVPTXISD::Suld2DV2I32Clamp: 2889 Opc = NVPTX::SULD_2D_V2I32_CLAMP_R; 2890 break; 2891 case NVPTXISD::Suld2DV2I64Clamp: 2892 Opc = NVPTX::SULD_2D_V2I64_CLAMP_R; 2893 break; 2894 case NVPTXISD::Suld2DV4I8Clamp: 2895 Opc = NVPTX::SULD_2D_V4I8_CLAMP_R; 2896 break; 2897 case NVPTXISD::Suld2DV4I16Clamp: 2898 Opc = NVPTX::SULD_2D_V4I16_CLAMP_R; 2899 break; 2900 case NVPTXISD::Suld2DV4I32Clamp: 2901 Opc = NVPTX::SULD_2D_V4I32_CLAMP_R; 2902 break; 2903 case NVPTXISD::Suld2DArrayI8Clamp: 2904 Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R; 2905 break; 2906 case NVPTXISD::Suld2DArrayI16Clamp: 2907 Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R; 2908 break; 2909 case NVPTXISD::Suld2DArrayI32Clamp: 2910 Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R; 2911 break; 2912 case NVPTXISD::Suld2DArrayI64Clamp: 2913 Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R; 2914 break; 2915 case NVPTXISD::Suld2DArrayV2I8Clamp: 2916 Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R; 2917 break; 2918 case NVPTXISD::Suld2DArrayV2I16Clamp: 2919 Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R; 2920 break; 2921 case NVPTXISD::Suld2DArrayV2I32Clamp: 2922 Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R; 2923 break; 2924 case NVPTXISD::Suld2DArrayV2I64Clamp: 2925 Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R; 2926 break; 2927 case NVPTXISD::Suld2DArrayV4I8Clamp: 2928 Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R; 2929 break; 2930 case NVPTXISD::Suld2DArrayV4I16Clamp: 2931 Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R; 2932 break; 2933 case NVPTXISD::Suld2DArrayV4I32Clamp: 2934 Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R; 2935 break; 2936 case NVPTXISD::Suld3DI8Clamp: 2937 Opc = NVPTX::SULD_3D_I8_CLAMP_R; 2938 break; 2939 case NVPTXISD::Suld3DI16Clamp: 2940 Opc = NVPTX::SULD_3D_I16_CLAMP_R; 2941 break; 2942 case NVPTXISD::Suld3DI32Clamp: 2943 Opc = NVPTX::SULD_3D_I32_CLAMP_R; 2944 break; 2945 case NVPTXISD::Suld3DI64Clamp: 2946 Opc = NVPTX::SULD_3D_I64_CLAMP_R; 2947 break; 2948 case NVPTXISD::Suld3DV2I8Clamp: 2949 Opc = NVPTX::SULD_3D_V2I8_CLAMP_R; 2950 break; 2951 case NVPTXISD::Suld3DV2I16Clamp: 2952 Opc = NVPTX::SULD_3D_V2I16_CLAMP_R; 2953 break; 2954 case NVPTXISD::Suld3DV2I32Clamp: 2955 Opc = NVPTX::SULD_3D_V2I32_CLAMP_R; 2956 break; 2957 case NVPTXISD::Suld3DV2I64Clamp: 2958 Opc = NVPTX::SULD_3D_V2I64_CLAMP_R; 2959 break; 2960 case NVPTXISD::Suld3DV4I8Clamp: 2961 Opc = NVPTX::SULD_3D_V4I8_CLAMP_R; 2962 break; 2963 case NVPTXISD::Suld3DV4I16Clamp: 2964 Opc = NVPTX::SULD_3D_V4I16_CLAMP_R; 2965 break; 2966 case NVPTXISD::Suld3DV4I32Clamp: 2967 Opc = NVPTX::SULD_3D_V4I32_CLAMP_R; 2968 break; 2969 case NVPTXISD::Suld1DI8Trap: 2970 Opc = NVPTX::SULD_1D_I8_TRAP_R; 2971 break; 2972 case NVPTXISD::Suld1DI16Trap: 2973 Opc = NVPTX::SULD_1D_I16_TRAP_R; 2974 break; 2975 case NVPTXISD::Suld1DI32Trap: 2976 Opc = NVPTX::SULD_1D_I32_TRAP_R; 2977 break; 2978 case NVPTXISD::Suld1DI64Trap: 2979 Opc = NVPTX::SULD_1D_I64_TRAP_R; 2980 break; 2981 case NVPTXISD::Suld1DV2I8Trap: 2982 Opc = NVPTX::SULD_1D_V2I8_TRAP_R; 2983 break; 2984 case NVPTXISD::Suld1DV2I16Trap: 2985 Opc = NVPTX::SULD_1D_V2I16_TRAP_R; 2986 break; 2987 case NVPTXISD::Suld1DV2I32Trap: 2988 Opc = NVPTX::SULD_1D_V2I32_TRAP_R; 2989 break; 2990 case NVPTXISD::Suld1DV2I64Trap: 2991 Opc = NVPTX::SULD_1D_V2I64_TRAP_R; 2992 break; 2993 case NVPTXISD::Suld1DV4I8Trap: 2994 Opc = NVPTX::SULD_1D_V4I8_TRAP_R; 2995 break; 2996 case NVPTXISD::Suld1DV4I16Trap: 2997 Opc = NVPTX::SULD_1D_V4I16_TRAP_R; 2998 break; 2999 case NVPTXISD::Suld1DV4I32Trap: 3000 Opc = NVPTX::SULD_1D_V4I32_TRAP_R; 3001 break; 3002 case NVPTXISD::Suld1DArrayI8Trap: 3003 Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R; 3004 break; 3005 case NVPTXISD::Suld1DArrayI16Trap: 3006 Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R; 3007 break; 3008 case NVPTXISD::Suld1DArrayI32Trap: 3009 Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R; 3010 break; 3011 case NVPTXISD::Suld1DArrayI64Trap: 3012 Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R; 3013 break; 3014 case NVPTXISD::Suld1DArrayV2I8Trap: 3015 Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R; 3016 break; 3017 case NVPTXISD::Suld1DArrayV2I16Trap: 3018 Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R; 3019 break; 3020 case NVPTXISD::Suld1DArrayV2I32Trap: 3021 Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R; 3022 break; 3023 case NVPTXISD::Suld1DArrayV2I64Trap: 3024 Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R; 3025 break; 3026 case NVPTXISD::Suld1DArrayV4I8Trap: 3027 Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R; 3028 break; 3029 case NVPTXISD::Suld1DArrayV4I16Trap: 3030 Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R; 3031 break; 3032 case NVPTXISD::Suld1DArrayV4I32Trap: 3033 Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R; 3034 break; 3035 case NVPTXISD::Suld2DI8Trap: 3036 Opc = NVPTX::SULD_2D_I8_TRAP_R; 3037 break; 3038 case NVPTXISD::Suld2DI16Trap: 3039 Opc = NVPTX::SULD_2D_I16_TRAP_R; 3040 break; 3041 case NVPTXISD::Suld2DI32Trap: 3042 Opc = NVPTX::SULD_2D_I32_TRAP_R; 3043 break; 3044 case NVPTXISD::Suld2DI64Trap: 3045 Opc = NVPTX::SULD_2D_I64_TRAP_R; 3046 break; 3047 case NVPTXISD::Suld2DV2I8Trap: 3048 Opc = NVPTX::SULD_2D_V2I8_TRAP_R; 3049 break; 3050 case NVPTXISD::Suld2DV2I16Trap: 3051 Opc = NVPTX::SULD_2D_V2I16_TRAP_R; 3052 break; 3053 case NVPTXISD::Suld2DV2I32Trap: 3054 Opc = NVPTX::SULD_2D_V2I32_TRAP_R; 3055 break; 3056 case NVPTXISD::Suld2DV2I64Trap: 3057 Opc = NVPTX::SULD_2D_V2I64_TRAP_R; 3058 break; 3059 case NVPTXISD::Suld2DV4I8Trap: 3060 Opc = NVPTX::SULD_2D_V4I8_TRAP_R; 3061 break; 3062 case NVPTXISD::Suld2DV4I16Trap: 3063 Opc = NVPTX::SULD_2D_V4I16_TRAP_R; 3064 break; 3065 case NVPTXISD::Suld2DV4I32Trap: 3066 Opc = NVPTX::SULD_2D_V4I32_TRAP_R; 3067 break; 3068 case NVPTXISD::Suld2DArrayI8Trap: 3069 Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R; 3070 break; 3071 case NVPTXISD::Suld2DArrayI16Trap: 3072 Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R; 3073 break; 3074 case NVPTXISD::Suld2DArrayI32Trap: 3075 Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R; 3076 break; 3077 case NVPTXISD::Suld2DArrayI64Trap: 3078 Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R; 3079 break; 3080 case NVPTXISD::Suld2DArrayV2I8Trap: 3081 Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R; 3082 break; 3083 case NVPTXISD::Suld2DArrayV2I16Trap: 3084 Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R; 3085 break; 3086 case NVPTXISD::Suld2DArrayV2I32Trap: 3087 Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R; 3088 break; 3089 case NVPTXISD::Suld2DArrayV2I64Trap: 3090 Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R; 3091 break; 3092 case NVPTXISD::Suld2DArrayV4I8Trap: 3093 Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R; 3094 break; 3095 case NVPTXISD::Suld2DArrayV4I16Trap: 3096 Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R; 3097 break; 3098 case NVPTXISD::Suld2DArrayV4I32Trap: 3099 Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R; 3100 break; 3101 case NVPTXISD::Suld3DI8Trap: 3102 Opc = NVPTX::SULD_3D_I8_TRAP_R; 3103 break; 3104 case NVPTXISD::Suld3DI16Trap: 3105 Opc = NVPTX::SULD_3D_I16_TRAP_R; 3106 break; 3107 case NVPTXISD::Suld3DI32Trap: 3108 Opc = NVPTX::SULD_3D_I32_TRAP_R; 3109 break; 3110 case NVPTXISD::Suld3DI64Trap: 3111 Opc = NVPTX::SULD_3D_I64_TRAP_R; 3112 break; 3113 case NVPTXISD::Suld3DV2I8Trap: 3114 Opc = NVPTX::SULD_3D_V2I8_TRAP_R; 3115 break; 3116 case NVPTXISD::Suld3DV2I16Trap: 3117 Opc = NVPTX::SULD_3D_V2I16_TRAP_R; 3118 break; 3119 case NVPTXISD::Suld3DV2I32Trap: 3120 Opc = NVPTX::SULD_3D_V2I32_TRAP_R; 3121 break; 3122 case NVPTXISD::Suld3DV2I64Trap: 3123 Opc = NVPTX::SULD_3D_V2I64_TRAP_R; 3124 break; 3125 case NVPTXISD::Suld3DV4I8Trap: 3126 Opc = NVPTX::SULD_3D_V4I8_TRAP_R; 3127 break; 3128 case NVPTXISD::Suld3DV4I16Trap: 3129 Opc = NVPTX::SULD_3D_V4I16_TRAP_R; 3130 break; 3131 case NVPTXISD::Suld3DV4I32Trap: 3132 Opc = NVPTX::SULD_3D_V4I32_TRAP_R; 3133 break; 3134 case NVPTXISD::Suld1DI8Zero: 3135 Opc = NVPTX::SULD_1D_I8_ZERO_R; 3136 break; 3137 case NVPTXISD::Suld1DI16Zero: 3138 Opc = NVPTX::SULD_1D_I16_ZERO_R; 3139 break; 3140 case NVPTXISD::Suld1DI32Zero: 3141 Opc = NVPTX::SULD_1D_I32_ZERO_R; 3142 break; 3143 case NVPTXISD::Suld1DI64Zero: 3144 Opc = NVPTX::SULD_1D_I64_ZERO_R; 3145 break; 3146 case NVPTXISD::Suld1DV2I8Zero: 3147 Opc = NVPTX::SULD_1D_V2I8_ZERO_R; 3148 break; 3149 case NVPTXISD::Suld1DV2I16Zero: 3150 Opc = NVPTX::SULD_1D_V2I16_ZERO_R; 3151 break; 3152 case NVPTXISD::Suld1DV2I32Zero: 3153 Opc = NVPTX::SULD_1D_V2I32_ZERO_R; 3154 break; 3155 case NVPTXISD::Suld1DV2I64Zero: 3156 Opc = NVPTX::SULD_1D_V2I64_ZERO_R; 3157 break; 3158 case NVPTXISD::Suld1DV4I8Zero: 3159 Opc = NVPTX::SULD_1D_V4I8_ZERO_R; 3160 break; 3161 case NVPTXISD::Suld1DV4I16Zero: 3162 Opc = NVPTX::SULD_1D_V4I16_ZERO_R; 3163 break; 3164 case NVPTXISD::Suld1DV4I32Zero: 3165 Opc = NVPTX::SULD_1D_V4I32_ZERO_R; 3166 break; 3167 case NVPTXISD::Suld1DArrayI8Zero: 3168 Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R; 3169 break; 3170 case NVPTXISD::Suld1DArrayI16Zero: 3171 Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R; 3172 break; 3173 case NVPTXISD::Suld1DArrayI32Zero: 3174 Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R; 3175 break; 3176 case NVPTXISD::Suld1DArrayI64Zero: 3177 Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R; 3178 break; 3179 case NVPTXISD::Suld1DArrayV2I8Zero: 3180 Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R; 3181 break; 3182 case NVPTXISD::Suld1DArrayV2I16Zero: 3183 Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R; 3184 break; 3185 case NVPTXISD::Suld1DArrayV2I32Zero: 3186 Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R; 3187 break; 3188 case NVPTXISD::Suld1DArrayV2I64Zero: 3189 Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R; 3190 break; 3191 case NVPTXISD::Suld1DArrayV4I8Zero: 3192 Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R; 3193 break; 3194 case NVPTXISD::Suld1DArrayV4I16Zero: 3195 Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R; 3196 break; 3197 case NVPTXISD::Suld1DArrayV4I32Zero: 3198 Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R; 3199 break; 3200 case NVPTXISD::Suld2DI8Zero: 3201 Opc = NVPTX::SULD_2D_I8_ZERO_R; 3202 break; 3203 case NVPTXISD::Suld2DI16Zero: 3204 Opc = NVPTX::SULD_2D_I16_ZERO_R; 3205 break; 3206 case NVPTXISD::Suld2DI32Zero: 3207 Opc = NVPTX::SULD_2D_I32_ZERO_R; 3208 break; 3209 case NVPTXISD::Suld2DI64Zero: 3210 Opc = NVPTX::SULD_2D_I64_ZERO_R; 3211 break; 3212 case NVPTXISD::Suld2DV2I8Zero: 3213 Opc = NVPTX::SULD_2D_V2I8_ZERO_R; 3214 break; 3215 case NVPTXISD::Suld2DV2I16Zero: 3216 Opc = NVPTX::SULD_2D_V2I16_ZERO_R; 3217 break; 3218 case NVPTXISD::Suld2DV2I32Zero: 3219 Opc = NVPTX::SULD_2D_V2I32_ZERO_R; 3220 break; 3221 case NVPTXISD::Suld2DV2I64Zero: 3222 Opc = NVPTX::SULD_2D_V2I64_ZERO_R; 3223 break; 3224 case NVPTXISD::Suld2DV4I8Zero: 3225 Opc = NVPTX::SULD_2D_V4I8_ZERO_R; 3226 break; 3227 case NVPTXISD::Suld2DV4I16Zero: 3228 Opc = NVPTX::SULD_2D_V4I16_ZERO_R; 3229 break; 3230 case NVPTXISD::Suld2DV4I32Zero: 3231 Opc = NVPTX::SULD_2D_V4I32_ZERO_R; 3232 break; 3233 case NVPTXISD::Suld2DArrayI8Zero: 3234 Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R; 3235 break; 3236 case NVPTXISD::Suld2DArrayI16Zero: 3237 Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R; 3238 break; 3239 case NVPTXISD::Suld2DArrayI32Zero: 3240 Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R; 3241 break; 3242 case NVPTXISD::Suld2DArrayI64Zero: 3243 Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R; 3244 break; 3245 case NVPTXISD::Suld2DArrayV2I8Zero: 3246 Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R; 3247 break; 3248 case NVPTXISD::Suld2DArrayV2I16Zero: 3249 Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R; 3250 break; 3251 case NVPTXISD::Suld2DArrayV2I32Zero: 3252 Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R; 3253 break; 3254 case NVPTXISD::Suld2DArrayV2I64Zero: 3255 Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R; 3256 break; 3257 case NVPTXISD::Suld2DArrayV4I8Zero: 3258 Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R; 3259 break; 3260 case NVPTXISD::Suld2DArrayV4I16Zero: 3261 Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R; 3262 break; 3263 case NVPTXISD::Suld2DArrayV4I32Zero: 3264 Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R; 3265 break; 3266 case NVPTXISD::Suld3DI8Zero: 3267 Opc = NVPTX::SULD_3D_I8_ZERO_R; 3268 break; 3269 case NVPTXISD::Suld3DI16Zero: 3270 Opc = NVPTX::SULD_3D_I16_ZERO_R; 3271 break; 3272 case NVPTXISD::Suld3DI32Zero: 3273 Opc = NVPTX::SULD_3D_I32_ZERO_R; 3274 break; 3275 case NVPTXISD::Suld3DI64Zero: 3276 Opc = NVPTX::SULD_3D_I64_ZERO_R; 3277 break; 3278 case NVPTXISD::Suld3DV2I8Zero: 3279 Opc = NVPTX::SULD_3D_V2I8_ZERO_R; 3280 break; 3281 case NVPTXISD::Suld3DV2I16Zero: 3282 Opc = NVPTX::SULD_3D_V2I16_ZERO_R; 3283 break; 3284 case NVPTXISD::Suld3DV2I32Zero: 3285 Opc = NVPTX::SULD_3D_V2I32_ZERO_R; 3286 break; 3287 case NVPTXISD::Suld3DV2I64Zero: 3288 Opc = NVPTX::SULD_3D_V2I64_ZERO_R; 3289 break; 3290 case NVPTXISD::Suld3DV4I8Zero: 3291 Opc = NVPTX::SULD_3D_V4I8_ZERO_R; 3292 break; 3293 case NVPTXISD::Suld3DV4I16Zero: 3294 Opc = NVPTX::SULD_3D_V4I16_ZERO_R; 3295 break; 3296 case NVPTXISD::Suld3DV4I32Zero: 3297 Opc = NVPTX::SULD_3D_V4I32_ZERO_R; 3298 break; 3299 } 3300 3301 // Copy over operands 3302 SmallVector<SDValue, 8> Ops(drop_begin(N->ops())); 3303 Ops.push_back(N->getOperand(0)); // Move chain to the back. 3304 3305 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops)); 3306 return true; 3307 } 3308 3309 3310 /// SelectBFE - Look for instruction sequences that can be made more efficient 3311 /// by using the 'bfe' (bit-field extract) PTX instruction 3312 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { 3313 SDLoc DL(N); 3314 SDValue LHS = N->getOperand(0); 3315 SDValue RHS = N->getOperand(1); 3316 SDValue Len; 3317 SDValue Start; 3318 SDValue Val; 3319 bool IsSigned = false; 3320 3321 if (N->getOpcode() == ISD::AND) { 3322 // Canonicalize the operands 3323 // We want 'and %val, %mask' 3324 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) { 3325 std::swap(LHS, RHS); 3326 } 3327 3328 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS); 3329 if (!Mask) { 3330 // We need a constant mask on the RHS of the AND 3331 return false; 3332 } 3333 3334 // Extract the mask bits 3335 uint64_t MaskVal = Mask->getZExtValue(); 3336 if (!isMask_64(MaskVal)) { 3337 // We *could* handle shifted masks here, but doing so would require an 3338 // 'and' operation to fix up the low-order bits so we would trade 3339 // shr+and for bfe+and, which has the same throughput 3340 return false; 3341 } 3342 3343 // How many bits are in our mask? 3344 int64_t NumBits = countr_one(MaskVal); 3345 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32); 3346 3347 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) { 3348 // We have a 'srl/and' pair, extract the effective start bit and length 3349 Val = LHS.getNode()->getOperand(0); 3350 Start = LHS.getNode()->getOperand(1); 3351 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start); 3352 if (StartConst) { 3353 uint64_t StartVal = StartConst->getZExtValue(); 3354 // How many "good" bits do we have left? "good" is defined here as bits 3355 // that exist in the original value, not shifted in. 3356 int64_t GoodBits = Start.getValueSizeInBits() - StartVal; 3357 if (NumBits > GoodBits) { 3358 // Do not handle the case where bits have been shifted in. In theory 3359 // we could handle this, but the cost is likely higher than just 3360 // emitting the srl/and pair. 3361 return false; 3362 } 3363 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32); 3364 } else { 3365 // Do not handle the case where the shift amount (can be zero if no srl 3366 // was found) is not constant. We could handle this case, but it would 3367 // require run-time logic that would be more expensive than just 3368 // emitting the srl/and pair. 3369 return false; 3370 } 3371 } else { 3372 // Do not handle the case where the LHS of the and is not a shift. While 3373 // it would be trivial to handle this case, it would just transform 3374 // 'and' -> 'bfe', but 'and' has higher-throughput. 3375 return false; 3376 } 3377 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) { 3378 if (LHS->getOpcode() == ISD::AND) { 3379 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS); 3380 if (!ShiftCnst) { 3381 // Shift amount must be constant 3382 return false; 3383 } 3384 3385 uint64_t ShiftAmt = ShiftCnst->getZExtValue(); 3386 3387 SDValue AndLHS = LHS->getOperand(0); 3388 SDValue AndRHS = LHS->getOperand(1); 3389 3390 // Canonicalize the AND to have the mask on the RHS 3391 if (isa<ConstantSDNode>(AndLHS)) { 3392 std::swap(AndLHS, AndRHS); 3393 } 3394 3395 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS); 3396 if (!MaskCnst) { 3397 // Mask must be constant 3398 return false; 3399 } 3400 3401 uint64_t MaskVal = MaskCnst->getZExtValue(); 3402 uint64_t NumZeros; 3403 uint64_t NumBits; 3404 if (isMask_64(MaskVal)) { 3405 NumZeros = 0; 3406 // The number of bits in the result bitfield will be the number of 3407 // trailing ones (the AND) minus the number of bits we shift off 3408 NumBits = llvm::countr_one(MaskVal) - ShiftAmt; 3409 } else if (isShiftedMask_64(MaskVal)) { 3410 NumZeros = llvm::countr_zero(MaskVal); 3411 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros); 3412 // The number of bits in the result bitfield will be the number of 3413 // trailing zeros plus the number of set bits in the mask minus the 3414 // number of bits we shift off 3415 NumBits = NumZeros + NumOnes - ShiftAmt; 3416 } else { 3417 // This is not a mask we can handle 3418 return false; 3419 } 3420 3421 if (ShiftAmt < NumZeros) { 3422 // Handling this case would require extra logic that would make this 3423 // transformation non-profitable 3424 return false; 3425 } 3426 3427 Val = AndLHS; 3428 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32); 3429 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32); 3430 } else if (LHS->getOpcode() == ISD::SHL) { 3431 // Here, we have a pattern like: 3432 // 3433 // (sra (shl val, NN), MM) 3434 // or 3435 // (srl (shl val, NN), MM) 3436 // 3437 // If MM >= NN, we can efficiently optimize this with bfe 3438 Val = LHS->getOperand(0); 3439 3440 SDValue ShlRHS = LHS->getOperand(1); 3441 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS); 3442 if (!ShlCnst) { 3443 // Shift amount must be constant 3444 return false; 3445 } 3446 uint64_t InnerShiftAmt = ShlCnst->getZExtValue(); 3447 3448 SDValue ShrRHS = RHS; 3449 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS); 3450 if (!ShrCnst) { 3451 // Shift amount must be constant 3452 return false; 3453 } 3454 uint64_t OuterShiftAmt = ShrCnst->getZExtValue(); 3455 3456 // To avoid extra codegen and be profitable, we need Outer >= Inner 3457 if (OuterShiftAmt < InnerShiftAmt) { 3458 return false; 3459 } 3460 3461 // If the outer shift is more than the type size, we have no bitfield to 3462 // extract (since we also check that the inner shift is <= the outer shift 3463 // then this also implies that the inner shift is < the type size) 3464 if (OuterShiftAmt >= Val.getValueSizeInBits()) { 3465 return false; 3466 } 3467 3468 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL, 3469 MVT::i32); 3470 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt, 3471 DL, MVT::i32); 3472 3473 if (N->getOpcode() == ISD::SRA) { 3474 // If we have a arithmetic right shift, we need to use the signed bfe 3475 // variant 3476 IsSigned = true; 3477 } 3478 } else { 3479 // No can do... 3480 return false; 3481 } 3482 } else { 3483 // No can do... 3484 return false; 3485 } 3486 3487 3488 unsigned Opc; 3489 // For the BFE operations we form here from "and" and "srl", always use the 3490 // unsigned variants. 3491 if (Val.getValueType() == MVT::i32) { 3492 if (IsSigned) { 3493 Opc = NVPTX::BFE_S32rii; 3494 } else { 3495 Opc = NVPTX::BFE_U32rii; 3496 } 3497 } else if (Val.getValueType() == MVT::i64) { 3498 if (IsSigned) { 3499 Opc = NVPTX::BFE_S64rii; 3500 } else { 3501 Opc = NVPTX::BFE_U64rii; 3502 } 3503 } else { 3504 // We cannot handle this type 3505 return false; 3506 } 3507 3508 SDValue Ops[] = { 3509 Val, Start, Len 3510 }; 3511 3512 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops)); 3513 return true; 3514 } 3515 3516 // SelectDirectAddr - Match a direct address for DAG. 3517 // A direct address could be a globaladdress or externalsymbol. 3518 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { 3519 // Return true if TGA or ES. 3520 if (N.getOpcode() == ISD::TargetGlobalAddress || 3521 N.getOpcode() == ISD::TargetExternalSymbol) { 3522 Address = N; 3523 return true; 3524 } 3525 if (N.getOpcode() == NVPTXISD::Wrapper) { 3526 Address = N.getOperand(0); 3527 return true; 3528 } 3529 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol 3530 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) { 3531 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC && 3532 CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM && 3533 CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam) 3534 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address); 3535 } 3536 return false; 3537 } 3538 3539 // symbol+offset 3540 bool NVPTXDAGToDAGISel::SelectADDRsi_imp( 3541 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) { 3542 if (Addr.getOpcode() == ISD::ADD) { 3543 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { 3544 SDValue base = Addr.getOperand(0); 3545 if (SelectDirectAddr(base, Base)) { 3546 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode), 3547 mvt); 3548 return true; 3549 } 3550 } 3551 } 3552 return false; 3553 } 3554 3555 // symbol+offset 3556 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr, 3557 SDValue &Base, SDValue &Offset) { 3558 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32); 3559 } 3560 3561 // symbol+offset 3562 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr, 3563 SDValue &Base, SDValue &Offset) { 3564 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64); 3565 } 3566 3567 // register+offset 3568 bool NVPTXDAGToDAGISel::SelectADDRri_imp( 3569 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) { 3570 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { 3571 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt); 3572 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt); 3573 return true; 3574 } 3575 if (Addr.getOpcode() == ISD::TargetExternalSymbol || 3576 Addr.getOpcode() == ISD::TargetGlobalAddress) 3577 return false; // direct calls. 3578 3579 if (Addr.getOpcode() == ISD::ADD) { 3580 if (SelectDirectAddr(Addr.getOperand(0), Addr)) { 3581 return false; 3582 } 3583 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { 3584 if (FrameIndexSDNode *FIN = 3585 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) 3586 // Constant offset from frame ref. 3587 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt); 3588 else 3589 Base = Addr.getOperand(0); 3590 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode), 3591 mvt); 3592 return true; 3593 } 3594 } 3595 return false; 3596 } 3597 3598 // register+offset 3599 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr, 3600 SDValue &Base, SDValue &Offset) { 3601 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32); 3602 } 3603 3604 // register+offset 3605 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr, 3606 SDValue &Base, SDValue &Offset) { 3607 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64); 3608 } 3609 3610 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, 3611 unsigned int spN) const { 3612 const Value *Src = nullptr; 3613 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) { 3614 if (spN == 0 && mN->getMemOperand()->getPseudoValue()) 3615 return true; 3616 Src = mN->getMemOperand()->getValue(); 3617 } 3618 if (!Src) 3619 return false; 3620 if (auto *PT = dyn_cast<PointerType>(Src->getType())) 3621 return (PT->getAddressSpace() == spN); 3622 return false; 3623 } 3624 3625 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for 3626 /// inline asm expressions. 3627 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand( 3628 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID, 3629 std::vector<SDValue> &OutOps) { 3630 SDValue Op0, Op1; 3631 switch (ConstraintID) { 3632 default: 3633 return true; 3634 case InlineAsm::ConstraintCode::m: // memory 3635 if (SelectDirectAddr(Op, Op0)) { 3636 OutOps.push_back(Op0); 3637 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32)); 3638 return false; 3639 } 3640 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) { 3641 OutOps.push_back(Op0); 3642 OutOps.push_back(Op1); 3643 return false; 3644 } 3645 break; 3646 } 3647 return true; 3648 } 3649 3650 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a 3651 /// conversion from \p SrcTy to \p DestTy. 3652 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy, 3653 LoadSDNode *LdNode) { 3654 bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD; 3655 switch (SrcTy.SimpleTy) { 3656 default: 3657 llvm_unreachable("Unhandled source type"); 3658 case MVT::i8: 3659 switch (DestTy.SimpleTy) { 3660 default: 3661 llvm_unreachable("Unhandled dest type"); 3662 case MVT::i16: 3663 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8; 3664 case MVT::i32: 3665 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8; 3666 case MVT::i64: 3667 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8; 3668 } 3669 case MVT::i16: 3670 switch (DestTy.SimpleTy) { 3671 default: 3672 llvm_unreachable("Unhandled dest type"); 3673 case MVT::i8: 3674 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16; 3675 case MVT::i32: 3676 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16; 3677 case MVT::i64: 3678 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16; 3679 } 3680 case MVT::i32: 3681 switch (DestTy.SimpleTy) { 3682 default: 3683 llvm_unreachable("Unhandled dest type"); 3684 case MVT::i8: 3685 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32; 3686 case MVT::i16: 3687 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32; 3688 case MVT::i64: 3689 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32; 3690 } 3691 case MVT::i64: 3692 switch (DestTy.SimpleTy) { 3693 default: 3694 llvm_unreachable("Unhandled dest type"); 3695 case MVT::i8: 3696 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64; 3697 case MVT::i16: 3698 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64; 3699 case MVT::i32: 3700 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64; 3701 } 3702 case MVT::f16: 3703 switch (DestTy.SimpleTy) { 3704 default: 3705 llvm_unreachable("Unhandled dest type"); 3706 case MVT::f32: 3707 return NVPTX::CVT_f32_f16; 3708 case MVT::f64: 3709 return NVPTX::CVT_f64_f16; 3710 } 3711 } 3712 } 3713