//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOptLevel OptLevel) {
  return new NVPTXDAGToDAGISel(TM, OptLevel);
}

char NVPTXDAGToDAGISel::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOptLevel OptLevel)
    : SelectionDAGISel(ID, tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > CodeGenOptLevel::None);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::useShortPointers() const {
  return TM.useShortPointers();
}

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;
  case NVPTXISD::SETP_BF16X2:
    SelectSETP_BF16X2(N);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP(N))
      return;
    break;
  default:
    break;
  }
  SelectCode(N);
}

bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}

// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
// have to load them into an .(b)f16 register first.
bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
  if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
  SDNode *LoadConstF16 = CurDAG->getMachineNode(
      (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
                                      : NVPTX::LOAD_CONST_BF16),
      SDLoc(N), N->getValueType(0), Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}
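
// (Illustration of the above, under assumed bit patterns: the f16 constant
// 1.0 is materialized as its raw IEEE-754 half bits, 0x3C00, in a 16-bit
// register via LOAD_CONST_F16; it cannot appear as an inline immediate
// operand of an .f16 instruction.)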

// Map ISD::CONDCODE value to appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode()
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}

bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about 16x2 as it's the only real vector type we
  // need to deal with.
  MVT VT = Vector.getSimpleValueType();
  if (!Isv2x16VT(VT))
    return false;
  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto *U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
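  // (Qualifying shape, shown for illustration only:
  //    %lo = extractelement <2 x half> %v, i32 0
  //    %hi = extractelement <2 x half> %v, i32 1
  //  If only one of the two lanes is ever extracted, we bail out below and
  //  leave the node to generic selection.)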
  if (E0.empty() || E1.empty())
    return false;

  // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
  // into f16,f16 SplitF16x2(V)
  MVT EltVT = VT.getVectorElementType();
  SDNode *ScatterOp =
      CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}

static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
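  // (Hypothetical example of the inference, for illustration: in a kernel
  //    __global__ void k(const float *__restrict__ in, float *out)
  //  loads through "in" satisfy the Argument check below -- noalias and never
  //  written to -- so they may be emitted as ld.global.nc.)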
  SmallVector<const Value *, 8> Objs;
  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

  return all_of(Objs, [&](const Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}

bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(0);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}

void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                               : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                               : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}

// Helper function template to reduce amount of boilerplate code for
// opcode selection.
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                unsigned Opcode_i16, unsigned Opcode_i32,
                std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
                std::optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
  case MVT::bf16:
    return Opcode_i16;
  case MVT::v2f16:
  case MVT::v2bf16:
  case MVT::v2i16:
  case MVT::v4i8:
    return Opcode_i32;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return std::nullopt;
  }
}

static int getLdStRegType(EVT VT) {
  if (VT.isFloatingPoint())
    switch (VT.getSimpleVT().SimpleTy) {
    case MVT::f16:
    case MVT::bf16:
    case MVT::v2f16:
    case MVT::v2bf16:
      return NVPTX::PTXLdStInstCode::Untyped;
    default:
      return NVPTX::PTXLdStInstCode::Float;
    }
  else
    return NVPTX::PTXLdStInstCode::Unsigned;
}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
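  // (Assumed mapping for the check below: monotonic/relaxed is the strongest
  // ordering we can express today, via .volatile, which PTX defines to behave
  // like .relaxed.sys; acquire and stronger orderings are rejected here.)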
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Signed   : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2f16/v2bf16/v2i16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);
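  // (Worked example, illustrative: a sextload of i8 yields fromType=Signed
  // with fromTypeWidth=8, i.e. "ld.s8"; a plain f32 load yields Float/32,
  // i.e. "ld.f32"; a v2f16 load is the untyped 32-bit "ld.b32".)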

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
                          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
                          NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Signed   : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8x16 is a special case. PTX doesn't have ld.v8.16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // load them with ld.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }
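  // (e.g., illustrative: a v8f16 LoadV4 is emitted as
  //    ld.v4.b32 {%r0, %r1, %r2, %r3}, [addr];
  //  with each 32-bit result lane holding one f16x2 pair.)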

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second
  // operand.
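  // (Operand layout assumed here: intrinsic nodes look like
  //    {chain, intrinsic-id, pointer, ...}
  //  whereas the custom LDG/LDU vector nodes look like {chain, pointer, ...}.)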
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = N->getConstantOperandVal(1);
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;
  EVT OrigType = N->getValueType(0);

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // vectors of 16bits type are loaded/stored as multiples of v2x16 elements.
    if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
        (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
        (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = OrigType;
      NumElts /= 2;
    } else if (OrigType == MVT::v4i8) {
      EltVT = OrigType;
      NumElts = 1;
    }
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
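  // (Illustration: a scalar ldg of i8 builds InstVTs = {i16, MVT::Other};
  //  the value is produced in a 16-bit register.)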
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::INTRINSIC_W_CHAIN:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  }

  // For automatic generation of LDG (through SelectLoad[Vector], not the
  // intrinsics), we may have an extending load like:
  //
  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
  //
  // In this case, the matching logic above will select a load for the original
  // memory type (in this case, i8) and our types will not match (the node needs
  // to return an i32 in this case). Our LDG/LDU nodes do not support the
  // concept of sign-/zero-extension, so emulate it here by adding an explicit
  // CVT instruction. Ptxas should clean up any redundancies here.

  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

  if (OrigType != EltVT &&
      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
    // We have an extending-load. The instruction we selected operates on the
    // smaller type, but the SDNode we are replacing has the larger type. We
    // need to emit a CVT to make the types match.
    unsigned CvtOpc =
        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);

    // For each output value, apply the manual sign/zero-extension and make sure
    // all users of the load go through that CVT.
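    // (e.g., illustrative: for the zext-from-i8 load shown above, CvtOpc is a
    // u8->u32 widening CVT with mode CvtNone, applied once per result value.)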
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode =
          CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
                                 CurDAG->getTargetConstant(
                                     NVPTX::PTXCvtMode::NONE, DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // We do not support pre/post inc/dec.
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement
  // them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2x16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
                          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
                          NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64,
                               NVPTX::ST_i16_areg_64, NVPTX::ST_i32_areg_64,
                               NVPTX::ST_i64_areg_64, NVPTX::ST_f32_areg_64,
                               NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;
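  // The end result here is a single vector store, e.g. roughly
  // "st.global.v4.f32 [%rd1], {%f1, %f2, %f3, %f4}" (a sketch; the exact PTX
  // depends on the address space, element type, and addressing mode below).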
  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType = getLdStRegType(ScalarVT);

  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8x16 is a special case. PTX doesn't have a st.v8.x16 instruction.
  // Instead, we split the vector into v2x16 chunks and store them with
  // st.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
    EltVT = MVT::i32;
    ToType = NVPTX::PTXLdStInstCode::Untyped;
    ToTypeWidth = 32;
  }

  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));

  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
                               NVPTX::STV_i32_v4_avar, std::nullopt,
                               NVPTX::STV_f32_v4_avar, std::nullopt);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
                          std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64
                 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
            NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
            NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
            NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
            NVPTX::STV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
                                 NVPTX::STV_i32_v4_ari, std::nullopt,
                                 NVPTX::STV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
            NVPTX::STV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
            NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, std::nullopt,
            NVPTX::STV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    StOps.push_back(N2);
  }

  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});

  ReplaceNode(N, ST);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
  SDValue Chain = Node->getOperand(0);
  SDValue Offset = Node->getOperand(2);
  SDValue Glue = Node->getOperand(3);
  SDLoc DL(Node);
  MemSDNode *Mem = cast<MemSDNode>(Node);

  unsigned VecSize;
  switch (Node->getOpcode()) {
  default:
    return false;
  case NVPTXISD::LoadParam:
    VecSize = 1;
    break;
  case NVPTXISD::LoadParamV2:
    VecSize = 2;
    break;
  case NVPTXISD::LoadParamV4:
    VecSize = 4;
    break;
  }

  EVT EltVT = Node->getValueType(0);
  EVT MemVT = Mem->getMemoryVT();

  std::optional<unsigned> Opcode;

  switch (VecSize) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
                             NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
                             NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
                             NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
    break;
  case 2:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
                        NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
                        NVPTX::LoadParamMemV2F64);
    break;
  case 4:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
                        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
                        std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDVTList VTs;
  if (VecSize == 1) {
    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
  } else if (VecSize == 2) {
    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
  } else {
    EVT EVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue};
    VTs = CurDAG->getVTList(EVTs);
  }

  unsigned OffsetVal = Offset->getAsZExtVal();

  SmallVector<SDValue, 2> Ops;
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);

  ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Offset = N->getOperand(1);
  unsigned OffsetVal = Offset->getAsZExtVal();
  MemSDNode *Mem = cast<MemSDNode>(N);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreRetval:
    NumElts = 1;
    break;
  case NVPTXISD::StoreRetvalV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreRetvalV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 6> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 2));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
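  // The selected machine node prints as a parameter-space store of the return
  // value, e.g. roughly "st.param.b32 [func_retval0+0], %r1" (a sketch; the
  // exact width and offset follow the function's return type).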
  std::optional<unsigned> Opcode;
  switch (NumElts) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
                             NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
                             NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
    break;
  case 2:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
                             NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
                             NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
    break;
  case 4:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
                             NVPTX::StoreRetvalV4I32, std::nullopt,
                             NVPTX::StoreRetvalV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Param = N->getOperand(1);
  unsigned ParamVal = Param->getAsZExtVal();
  SDValue Offset = N->getOperand(2);
  unsigned OffsetVal = Offset->getAsZExtVal();
  MemSDNode *Mem = cast<MemSDNode>(N);
  SDValue Glue = N->getOperand(N->getNumOperands() - 1);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreParamU32:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParam:
    NumElts = 1;
    break;
  case NVPTXISD::StoreParamV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreParamV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 3));
  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
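  // As with return values, the node prints as a parameter-space store for an
  // outgoing call argument, e.g. roughly "st.param.b32 [param0+4], %r2" (a
  // sketch; the parameter index comes from ParamVal and the offset from
  // OffsetVal above).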
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  default:
    switch (NumElts) {
    default:
      return false;
    case 1:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamI8, NVPTX::StoreParamI16,
                               NVPTX::StoreParamI32, NVPTX::StoreParamI64,
                               NVPTX::StoreParamF32, NVPTX::StoreParamF64);
      break;
    case 2:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
                               NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
                               NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
      break;
    case 4:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
                               NVPTX::StoreParamV4I32, std::nullopt,
                               NVPTX::StoreParamV4F32, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    break;
  // Special case: if we have a sign-extend/zero-extend node, insert the
  // conversion instruction first, and use that as the value operand to
  // the selected StoreParam node.
  case NVPTXISD::StoreParamU32: {
    Opcode = NVPTX::StoreParamI32;
    SDValue CvtNone =
        CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL, MVT::i32,
                                         Ops[0], CvtNone);
    Ops[0] = SDValue(Cvt, 0);
    break;
  }
  case NVPTXISD::StoreParamS32: {
    Opcode = NVPTX::StoreParamI32;
    SDValue CvtNone =
        CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL, MVT::i32,
                                         Ops[0], CvtNone);
    Ops[0] = SDValue(Cvt, 0);
    break;
  }
  }

  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
  return true;
}

bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::Tex1DFloatS32:
    Opc = NVPTX::TEX_1D_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloat:
    Opc = NVPTX::TEX_1D_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatLevel:
    Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatGrad:
    Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DS32S32:
    Opc = NVPTX::TEX_1D_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DS32Float:
    Opc = NVPTX::TEX_1D_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DS32FloatLevel:
    Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DS32FloatGrad:
    Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DU32S32:
    Opc = NVPTX::TEX_1D_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DU32Float:
    Opc = NVPTX::TEX_1D_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DU32FloatLevel:
    Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DU32FloatGrad:
    Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatS32:
    Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloat:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayS32S32:
    Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32Float:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayU32S32:
    Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32Float:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DFloatS32:
    Opc = NVPTX::TEX_2D_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloat:
    Opc = NVPTX::TEX_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatLevel:
    Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatGrad:
    Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DS32S32:
    Opc = NVPTX::TEX_2D_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DS32Float:
    Opc = NVPTX::TEX_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DS32FloatLevel:
    Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DS32FloatGrad:
    Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DU32S32:
    Opc = NVPTX::TEX_2D_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DU32Float:
    Opc = NVPTX::TEX_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DU32FloatLevel:
    Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DU32FloatGrad:
    Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatS32:
    Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloat:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayS32S32:
    Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32Float:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayU32S32:
    Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32Float:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DFloatS32:
    Opc = NVPTX::TEX_3D_F32_S32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloat:
    Opc = NVPTX::TEX_3D_F32_F32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatLevel:
    Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatGrad:
    Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DS32S32:
    Opc = NVPTX::TEX_3D_S32_S32_RR;
    break;
  case NVPTXISD::Tex3DS32Float:
    Opc = NVPTX::TEX_3D_S32_F32_RR;
    break;
  case NVPTXISD::Tex3DS32FloatLevel:
    Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DS32FloatGrad:
    Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DU32S32:
    Opc = NVPTX::TEX_3D_U32_S32_RR;
    break;
  case NVPTXISD::Tex3DU32Float:
    Opc = NVPTX::TEX_3D_U32_F32_RR;
    break;
  case NVPTXISD::Tex3DU32FloatLevel:
    Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DU32FloatGrad:
    Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::TexCubeFloatFloat:
    Opc = NVPTX::TEX_CUBE_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeS32Float:
    Opc = NVPTX::TEX_CUBE_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeU32Float:
    Opc = NVPTX::TEX_CUBE_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloat:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayS32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayU32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tld4R2DFloatFloat:
    Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DFloatFloat:
    Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DFloatFloat:
    Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DFloatFloat:
    Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DS64Float:
    Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DS64Float:
    Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DS64Float:
    Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DS64Float:
    Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DU64Float:
    Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DU64Float:
    Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DU64Float:
    Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DU64Float:
    Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
    break;
  case NVPTXISD::TexUnified1DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DS32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified3DS32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DU32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified3DU32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}

bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
  unsigned Opc = 0;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::Suld1DI8Clamp:
    Opc = NVPTX::SULD_1D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI16Clamp:
    Opc = NVPTX::SULD_1D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI32Clamp:
    Opc = NVPTX::SULD_1D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI64Clamp:
    Opc = NVPTX::SULD_1D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I8Clamp:
    Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I16Clamp:
    Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I32Clamp:
    Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I64Clamp:
    Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I8Clamp:
    Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I16Clamp:
    Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I32Clamp:
    Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI64Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI8Clamp:
    Opc = NVPTX::SULD_2D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI16Clamp:
    Opc = NVPTX::SULD_2D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI32Clamp:
    Opc = NVPTX::SULD_2D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI64Clamp:
    Opc = NVPTX::SULD_2D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I8Clamp:
    Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I16Clamp:
    Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I32Clamp:
    Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I64Clamp:
    Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I8Clamp:
    Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I16Clamp:
    Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I32Clamp:
    Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI64Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI8Clamp:
    Opc = NVPTX::SULD_3D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI16Clamp:
    Opc = NVPTX::SULD_3D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI32Clamp:
    Opc = NVPTX::SULD_3D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI64Clamp:
    Opc = NVPTX::SULD_3D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I8Clamp:
    Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I16Clamp:
    Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I32Clamp:
    Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I64Clamp:
    Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I8Clamp:
    Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I16Clamp:
    Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I32Clamp:
    Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI8Trap:
    Opc = NVPTX::SULD_1D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DI16Trap:
    Opc = NVPTX::SULD_1D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DI32Trap:
    Opc = NVPTX::SULD_1D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DI64Trap:
    Opc = NVPTX::SULD_1D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I8Trap:
    Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I16Trap:
    Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I32Trap:
    Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I64Trap:
    Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I8Trap:
    Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I16Trap:
    Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I32Trap:
    Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI64Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DI8Trap:
    Opc = NVPTX::SULD_2D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DI16Trap:
    Opc = NVPTX::SULD_2D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DI32Trap:
    Opc = NVPTX::SULD_2D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DI64Trap:
    Opc = NVPTX::SULD_2D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I8Trap:
    Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I16Trap:
    Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I32Trap:
    Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I64Trap:
    Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I8Trap:
    Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I16Trap:
    Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I32Trap:
    Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI64Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DI8Trap:
    Opc = NVPTX::SULD_3D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DI16Trap:
    Opc = NVPTX::SULD_3D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DI32Trap:
    Opc = NVPTX::SULD_3D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DI64Trap:
    Opc = NVPTX::SULD_3D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I8Trap:
    Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I16Trap:
    Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I32Trap:
    Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I64Trap:
    Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I8Trap:
    Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I16Trap:
    Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I32Trap:
    Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DI8Zero:
    Opc = NVPTX::SULD_1D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DI16Zero:
    Opc = NVPTX::SULD_1D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DI32Zero:
    Opc = NVPTX::SULD_1D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DI64Zero:
    Opc = NVPTX::SULD_1D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I8Zero:
    Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I16Zero:
    Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I32Zero:
    Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I64Zero:
    Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I8Zero:
    Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I16Zero:
    Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I32Zero:
    Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI64Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DI8Zero:
    Opc = NVPTX::SULD_2D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DI16Zero:
    Opc = NVPTX::SULD_2D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DI32Zero:
    Opc = NVPTX::SULD_2D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DI64Zero:
    Opc = NVPTX::SULD_2D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I8Zero:
    Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I16Zero:
    Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I32Zero:
    Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I64Zero:
    Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I8Zero:
    Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I16Zero:
    Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I32Zero:
    Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DI8Zero:
    Opc = NVPTX::SULD_3D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DI16Zero:
    Opc = NVPTX::SULD_3D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DI32Zero:
    Opc = NVPTX::SULD_3D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DI64Zero:
    Opc = NVPTX::SULD_3D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I8Zero:
    Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I16Zero:
    Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I32Zero:
    Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I64Zero:
    Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I8Zero:
    Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I16Zero:
    Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I32Zero:
    Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}

/// tryBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction.
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Len;
  SDValue Start;
  SDValue Val;
  bool IsSigned = false;

  if (N->getOpcode() == ISD::AND) {
    // Canonicalize the operands
    // We want 'and %val, %mask'
    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
      std::swap(LHS, RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits, so we would trade
      // shr+and for bfe+and, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    int64_t NumBits = countr_one(MaskVal);
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have a 'srl/and' pair, extract the effective start bit and length
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left? "good" is defined here as bits
        // that exist in the original value, not shifted in.
        int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
      } else {
        // Do not handle the case where the shift amount (can be zero if no srl
        // was found) is not constant. We could handle this case, but it would
        // require run-time logic that would be more expensive than just
        // emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the and is not a shift. While
      // it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher throughput.
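      // (For instance, a bare (and %val, 0xff) would only become
      // "bfe.u32 %r, %val, 0, 8", which buys nothing over the single and.)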
3351 return false; 3352 } 3353 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) { 3354 if (LHS->getOpcode() == ISD::AND) { 3355 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS); 3356 if (!ShiftCnst) { 3357 // Shift amount must be constant 3358 return false; 3359 } 3360 3361 uint64_t ShiftAmt = ShiftCnst->getZExtValue(); 3362 3363 SDValue AndLHS = LHS->getOperand(0); 3364 SDValue AndRHS = LHS->getOperand(1); 3365 3366 // Canonicalize the AND to have the mask on the RHS 3367 if (isa<ConstantSDNode>(AndLHS)) { 3368 std::swap(AndLHS, AndRHS); 3369 } 3370 3371 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS); 3372 if (!MaskCnst) { 3373 // Mask must be constant 3374 return false; 3375 } 3376 3377 uint64_t MaskVal = MaskCnst->getZExtValue(); 3378 uint64_t NumZeros; 3379 uint64_t NumBits; 3380 if (isMask_64(MaskVal)) { 3381 NumZeros = 0; 3382 // The number of bits in the result bitfield will be the number of 3383 // trailing ones (the AND) minus the number of bits we shift off 3384 NumBits = llvm::countr_one(MaskVal) - ShiftAmt; 3385 } else if (isShiftedMask_64(MaskVal)) { 3386 NumZeros = llvm::countr_zero(MaskVal); 3387 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros); 3388 // The number of bits in the result bitfield will be the number of 3389 // trailing zeros plus the number of set bits in the mask minus the 3390 // number of bits we shift off 3391 NumBits = NumZeros + NumOnes - ShiftAmt; 3392 } else { 3393 // This is not a mask we can handle 3394 return false; 3395 } 3396 3397 if (ShiftAmt < NumZeros) { 3398 // Handling this case would require extra logic that would make this 3399 // transformation non-profitable 3400 return false; 3401 } 3402 3403 Val = AndLHS; 3404 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32); 3405 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32); 3406 } else if (LHS->getOpcode() == ISD::SHL) { 3407 // Here, we have a pattern like: 3408 // 3409 // (sra (shl val, NN), MM) 3410 // or 3411 // (srl (shl val, NN), MM) 3412 // 3413 // If MM >= NN, we can efficiently optimize this with bfe 3414 Val = LHS->getOperand(0); 3415 3416 SDValue ShlRHS = LHS->getOperand(1); 3417 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS); 3418 if (!ShlCnst) { 3419 // Shift amount must be constant 3420 return false; 3421 } 3422 uint64_t InnerShiftAmt = ShlCnst->getZExtValue(); 3423 3424 SDValue ShrRHS = RHS; 3425 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS); 3426 if (!ShrCnst) { 3427 // Shift amount must be constant 3428 return false; 3429 } 3430 uint64_t OuterShiftAmt = ShrCnst->getZExtValue(); 3431 3432 // To avoid extra codegen and be profitable, we need Outer >= Inner 3433 if (OuterShiftAmt < InnerShiftAmt) { 3434 return false; 3435 } 3436 3437 // If the outer shift is more than the type size, we have no bitfield to 3438 // extract (since we also check that the inner shift is <= the outer shift 3439 // then this also implies that the inner shift is < the type size) 3440 if (OuterShiftAmt >= Val.getValueSizeInBits()) { 3441 return false; 3442 } 3443 3444 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL, 3445 MVT::i32); 3446 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt, 3447 DL, MVT::i32); 3448 3449 if (N->getOpcode() == ISD::SRA) { 3450 // If we have a arithmetic right shift, we need to use the signed bfe 3451 // variant 3452 IsSigned = true; 3453 } 3454 } else { 3455 // No can do... 3456 return false; 3457 } 3458 } else { 3459 // No can do... 

  unsigned Opc;
  // The BFE operations formed from 'and' and 'srl' above always use the
  // unsigned variants; only the (shr (shl ...)) pattern with an arithmetic
  // outer shift sets IsSigned and selects the signed form.
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {Val, Start, Len};

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}

// SelectDirectAddr - Match a direct address for DAG.
// A direct address could be a global address or an external symbol.
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
  // Return true if TGA or ES.
  if (N.getOpcode() == ISD::TargetGlobalAddress ||
      N.getOpcode() == ISD::TargetExternalSymbol) {
    Address = N;
    return true;
  }
  if (N.getOpcode() == NVPTXISD::Wrapper) {
    Address = N.getOperand(0);
    return true;
  }
  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
    if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
  }
  return false;
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  if (Addr.getOpcode() == ISD::ADD) {
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      SDValue base = Addr.getOperand(0);
      if (SelectDirectAddr(base, Base)) {
        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                           mvt);
        return true;
      }
    }
  }
  return false;
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
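
// Illustrative example of the symbol+offset form (hypothetical operands):
// for an address such as
//   (add (NVPTXISD::Wrapper (TargetGlobalAddress @sym)), 8)
// SelectADDRsi returns Base = @sym and Offset = 8, i.e. the [sym+8]
// addressing form.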

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
    Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
    return true;
  }
  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
      Addr.getOpcode() == ISD::TargetGlobalAddress)
    return false; // direct calls.

  if (Addr.getOpcode() == ISD::ADD) {
    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
      return false;
    }
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      if (FrameIndexSDNode *FIN =
              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
        // Constant offset from frame ref.
        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
      else
        Base = Addr.getOperand(0);
      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                         mvt);
      return true;
    }
  }
  return false;
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}

bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                 unsigned int spN) const {
  const Value *Src = nullptr;
  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
      return true;
    Src = mN->getMemOperand()->getValue();
  }
  if (!Src)
    return false;
  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
    return (PT->getAddressSpace() == spN);
  return false;
}

/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1;
  switch (ConstraintID) {
  default:
    return true;
  case InlineAsm::ConstraintCode::m: // memory
    if (SelectDirectAddr(Op, Op0)) {
      OutOps.push_back(Op0);
      OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
      return false;
    }
    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
      OutOps.push_back(Op0);
      OutOps.push_back(Op1);
      return false;
    }
    break;
  }
  return true;
}
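
// Note (illustrative): this hook follows the SelectionDAGISel convention of
// returning false on success. An "m"-constrained operand that is a plain
// global symbol becomes the operand pair (symbol, 0); anything else falls
// back to the register+offset form produced by SelectADDRri.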

/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
/// conversion from \p SrcTy to \p DestTy.
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
                                             LoadSDNode *LdNode) {
  bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
  switch (SrcTy.SimpleTy) {
  default:
    llvm_unreachable("Unhandled source type");
  case MVT::i8:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
    }
  case MVT::i16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
    }
  case MVT::i32:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
    }
  case MVT::i64:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
    }
  case MVT::f16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::f32:
      return NVPTX::CVT_f32_f16;
    case MVT::f64:
      return NVPTX::CVT_f64_f16;
    }
  }
}
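
// For example (illustrative): a sign-extending (SEXTLOAD) load of an i16
// widened to i32 maps to NVPTX::CVT_s32_s16, while the zero- or any-extending
// form maps to NVPTX::CVT_u32_u16.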