//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

/// createNVPTXISelDag - This pass converts a legalized DAG into an
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOpt::Level OptLevel) {
  return new NVPTXDAGToDAGISel(TM, OptLevel);
}

char NVPTXDAGToDAGISel::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOpt::Level OptLevel)
    : SelectionDAGISel(ID, tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > 0);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::useShortPointers() const {
  return TM.useShortPointers();
}

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;

  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP(N))
      return;
    break;
  default:
    break;
  }
  SelectCode(N);
}

bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}
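
// Illustrative note (assuming the usual clang lowering of the __ldg() builtin
// to these nvvm intrinsics): a read such as __ldg(p) arrives here as
//   i32,ch = INTRINSIC_W_CHAIN(ch, llvm.nvvm.ldg.global.i, p)
// and is selected into an ld.global.nc instruction by tryLDGLDU() below.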

// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
// have to load them into an .(b)f16 register first.
bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
  if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
  SDNode *LoadConstF16 = CurDAG->getMachineNode(
      (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
                                      : NVPTX::LOAD_CONST_BF16),
      SDLoc(N), N->getValueType(0), Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}

// Map ISD::CONDCODE value to appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode()
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}
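
// For example, an ordered (setolt x, y) maps to CmpMode::LT while its
// unordered counterpart (setult x, y) maps to CmpMode::LTU; when FTZ is
// requested, the OR'd FTZ_FLAG makes the printer emit an additional .ftz
// modifier on the resulting setp (illustratively, setp.lt.ftz.f16x2 below;
// see printCmpMode() for the authoritative spelling).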

bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about f16x2 as it's the only real vector type we
  // need to deal with.
  MVT VT = Vector.getSimpleValueType();
  if (!(VT == MVT::v2f16 || VT == MVT::v2bf16))
    return false;
  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto *U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  // Merge (f16 extractelt(V, 0), f16 extractelt(V, 1))
  // into f16,f16 SplitF16x2(V)
  MVT EltVT = VT.getVectorElementType();
  SDNode *ScatterOp =
      CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}
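
// Sketch of the rewrite above: given
//   %e0 = extractelt <2 x half> %v, 0
//   %e1 = extractelt <2 x half> %v, 1
// both extracts are redirected to the two results of a single I32toV2I16
// machine node (a mov.b32-style {lo, hi} unpack of the packed register), so
// the pair costs one move instead of two.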

static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and
  // ideally not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
  SmallVector<const Value *, 8> Objs;
  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

  return all_of(Objs, [&](const Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}
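
// For example (illustrative IR), both of these loads qualify for ldg:
//   %v = load i32, ptr addrspace(1) %p, !invariant.load !0
// and any load whose pointer traces back only to a kernel argument declared
//   ptr addrspace(1) noalias readonly %in   ; e.g. "const T *__restrict__"
// since that memory is never written for the duration of the kernel.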

bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}

void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                               : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                               : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}
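
// Concretely (illustrative PTX): on a 64-bit target, casting shared->generic
// selects cvta_shared_yes_64, which prints roughly as "cvta.shared.u64",
// while the generic->shared direction selects cvta_to_shared_yes_64
// ("cvta.to.shared.u64"). The *_3264/*_6432 variants cover short-pointer
// mode, where shared/const/local pointers are only 32 bits wide.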

// Helper function template to reduce amount of boilerplate code for
// opcode selection.
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                unsigned Opcode_i16, unsigned Opcode_i32,
                std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
                std::optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
  case MVT::bf16:
    return Opcode_i16;
  case MVT::v2f16:
  case MVT::v2bf16:
    return Opcode_i32;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return std::nullopt;
  }
}
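
// Usage sketch (hypothetical call): f16/bf16 values travel through 16-bit
// integer registers and v2f16/v2bf16 through untyped 32-bit ones, so
//   pickOpcodeForVT(MVT::v2f16, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
//                   NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
//                   NVPTX::LD_f32_avar, NVPTX::LD_f64_avar)
// returns LD_i32_avar. An unsupported type falls through to std::nullopt so
// callers can bail out of selection.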

static int getLdStRegType(EVT VT) {
  if (VT.isFloatingPoint())
    switch (VT.getSimpleVT().SimpleTy) {
    case MVT::f16:
    case MVT::bf16:
    case MVT::v2f16:
    case MVT::v2bf16:
      return NVPTX::PTXLdStInstCode::Untyped;
    default:
      return NVPTX::PTXLdStInstCode::Float;
    }
  else
    return NVPTX::PTXLdStInstCode::Unsigned;
}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement
  // them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((LoadedVT == MVT::v2f16 || LoadedVT == MVT::v2bf16) &&
           "Unexpected vector type");
    // v2f16/v2bf16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64,
                               NVPTX::LD_i16_areg_64, NVPTX::LD_i32_areg_64,
                               NVPTX::LD_i64_areg_64, NVPTX::LD_f32_areg_64,
                               NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
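
// A note on the addressing forms tried above, which the vector and store
// paths below mirror: selection prefers a direct/symbolic address (avar),
// then symbol+immediate (asi), then register+immediate (ari), and finally
// falls back to a plain register (areg), each in a 32- or 64-bit pointer
// flavor depending on the address space.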

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
  if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}
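
// Select a load through the non-coherent global cache (ld.global.nc, aka
// "LDG") or the corresponding ldu.global form. Reached either from the nvvm
// ldg/ldu intrinsics or from ordinary (vector) loads that canLowerToLDG()
// proved invariant.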

bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second
  // operand.
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // vectors of f16 are loaded/stored as multiples of v2f16 elements.
    if ((EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) ||
        (EltVT == MVT::bf16 && N->getValueType(0) == MVT::v2bf16)) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = N->getValueType(0);
      NumElts /= 2;
    }
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::INTRINSIC_W_CHAIN:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  }

  // For automatic generation of LDG (through SelectLoad[Vector], not the
  // intrinsics), we may have an extending load like:
  //
  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
  //
  // In this case, the matching logic above will select a load for the original
  // memory type (in this case, i8) and our types will not match (the node
  // needs to return an i32 in this case). Our LDG/LDU nodes do not support the
  // concept of sign-/zero-extension, so emulate it here by adding an explicit
  // CVT instruction. Ptxas should clean up any redundancies here.
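  //
  // For the zext-from-i8 example above this means (illustratively, assuming
  // the usual promotion of i8 results to 16-bit registers): the selected LDG
  // yields its value in a 16-bit register, and the appended CVT machine node
  // (roughly a cvt.u32.u8) widens it to the i32 the original node promised.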

  EVT OrigType = N->getValueType(0);
  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

  if (OrigType != EltVT &&
      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
    // We have an extending-load. The instruction we selected operates on the
    // smaller type, but the SDNode we are replacing has the larger type. We
    // need to emit a CVT to make the types match.
    unsigned CvtOpc =
        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);

    // For each output value, apply the manual sign/zero-extension and make
    // sure all users of the load go through that CVT.
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode =
          CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
                                 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
                                                           DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((StoreVT == MVT::v2f16 || StoreVT == MVT::v2bf16) &&
           "Unexpected vector type");
    // v2f16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
                          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
                          NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
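  // Illustrative result (assumed PTX, not taken from the selector verbatim):
  // a StoreV4 of four f32 values through a global pointer is selected to
  // roughly
  //   st.global.v4.f32 [%rd1], {%f1, %f2, %f3, %f4};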
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType = getLdStRegType(ScalarVT);

  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8f16 is a special case. PTX doesn't have st.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // store them with st.v4.b32.
  if (EltVT == MVT::v2f16 || EltVT == MVT::v2bf16) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
    EltVT = MVT::i32;
    ToType = NVPTX::PTXLdStInstCode::Untyped;
    ToTypeWidth = 32;
  }

  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));

  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
                               NVPTX::STV_i32_v4_avar, std::nullopt,
                               NVPTX::STV_f32_v4_avar, std::nullopt);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
                          std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
                            NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
                            NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
            NVPTX::STV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
                                 NVPTX::STV_i32_v4_ari, std::nullopt,
                                 NVPTX::STV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
            NVPTX::STV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
                            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
                            std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    StOps.push_back(N2);
  }

  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
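  // Illustrative sketch (assumed PTX): for the v8f16 split handled above, the
  // four v2f16 chunks are stored untyped, roughly
  //   st.global.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};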

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});

  ReplaceNode(N, ST);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
  SDValue Chain = Node->getOperand(0);
  SDValue Offset = Node->getOperand(2);
  SDValue Glue = Node->getOperand(3);
  SDLoc DL(Node);
  MemSDNode *Mem = cast<MemSDNode>(Node);

  unsigned VecSize;
  switch (Node->getOpcode()) {
  default:
    return false;
  case NVPTXISD::LoadParam:
    VecSize = 1;
    break;
  case NVPTXISD::LoadParamV2:
    VecSize = 2;
    break;
  case NVPTXISD::LoadParamV4:
    VecSize = 4;
    break;
  }

  EVT EltVT = Node->getValueType(0);
  EVT MemVT = Mem->getMemoryVT();

  std::optional<unsigned> Opcode;

  switch (VecSize) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
                             NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
                             NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
                             NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
    break;
  case 2:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
                        NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
                        NVPTX::LoadParamMemV2F64);
    break;
  case 4:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
                        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
                        std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDVTList VTs;
  if (VecSize == 1) {
    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
  } else if (VecSize == 2) {
    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
  } else {
    EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
    VTs = CurDAG->getVTList(EVTs);
  }

  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();

  SmallVector<SDValue, 2> Ops;
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);

  ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Offset = N->getOperand(1);
  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
  MemSDNode *Mem = cast<MemSDNode>(N);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreRetval:
    NumElts = 1;
    break;
  case NVPTXISD::StoreRetvalV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreRetvalV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 6> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 2));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
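  // Illustrative sketch (assumed PTX): StoreRetvalI32 at offset 0 prints
  // roughly as
  //   st.param.b32 [func_retval0+0], %r1;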
  std::optional<unsigned> Opcode;
  switch (NumElts) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
                             NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
                             NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
    break;
  case 2:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
                             NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
                             NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
    break;
  case 4:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
                             NVPTX::StoreRetvalV4I32, std::nullopt,
                             NVPTX::StoreRetvalV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Param = N->getOperand(1);
  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
  SDValue Offset = N->getOperand(2);
  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
  MemSDNode *Mem = cast<MemSDNode>(N);
  SDValue Glue = N->getOperand(N->getNumOperands() - 1);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreParamU32:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParam:
    NumElts = 1;
    break;
  case NVPTXISD::StoreParamV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreParamV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 3));
  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
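  // Illustrative sketch (assumed PTX): StoreParamI32 for argument 0 at
  // offset 4 prints roughly as
  //   st.param.b32 [param0+4], %r1;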
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  default:
    switch (NumElts) {
    default:
      return false;
    case 1:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamI8, NVPTX::StoreParamI16,
                               NVPTX::StoreParamI32, NVPTX::StoreParamI64,
                               NVPTX::StoreParamF32, NVPTX::StoreParamF64);
      break;
    case 2:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
                               NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
                               NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
      break;
    case 4:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
                               NVPTX::StoreParamV4I32, std::nullopt,
                               NVPTX::StoreParamV4F32, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    break;
  // Special case: if we have a sign-extend/zero-extend node, insert the
  // conversion instruction first, and use that as the value operand to
  // the selected StoreParam node.
  case NVPTXISD::StoreParamU32: {
    Opcode = NVPTX::StoreParamI32;
    SDValue CvtNone =
        CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL, MVT::i32,
                                         Ops[0], CvtNone);
    Ops[0] = SDValue(Cvt, 0);
    break;
  }
  case NVPTXISD::StoreParamS32: {
    Opcode = NVPTX::StoreParamI32;
    SDValue CvtNone =
        CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL, MVT::i32,
                                         Ops[0], CvtNone);
    Ops[0] = SDValue(Cvt, 0);
    break;
  }
  }

  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
  return true;
}

bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default: return false;
  case NVPTXISD::Tex1DFloatS32:
    Opc = NVPTX::TEX_1D_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloat:
    Opc = NVPTX::TEX_1D_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatLevel:
    Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatGrad:
    Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DS32S32:
    Opc = NVPTX::TEX_1D_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DS32Float:
    Opc = NVPTX::TEX_1D_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DS32FloatLevel:
    Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DS32FloatGrad:
    Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DU32S32:
    Opc = NVPTX::TEX_1D_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DU32Float:
    Opc = NVPTX::TEX_1D_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DU32FloatLevel:
    Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DU32FloatGrad:
    Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatS32:
    Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloat:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayS32S32:
    Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32Float:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayU32S32:
    Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32Float:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DFloatS32:
    Opc = NVPTX::TEX_2D_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloat:
    Opc = NVPTX::TEX_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatLevel:
    Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatGrad:
    Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DS32S32:
    Opc = NVPTX::TEX_2D_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DS32Float:
    Opc = NVPTX::TEX_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DS32FloatLevel:
    Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DS32FloatGrad:
    Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DU32S32:
    Opc = NVPTX::TEX_2D_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DU32Float:
    Opc = NVPTX::TEX_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DU32FloatLevel:
    Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DU32FloatGrad:
    Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatS32:
    Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloat:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayS32S32:
    Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32Float:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayU32S32:
    Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32Float:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DFloatS32:
    Opc = NVPTX::TEX_3D_F32_S32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloat:
    Opc = NVPTX::TEX_3D_F32_F32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatLevel:
    Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatGrad:
    Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DS32S32:
    Opc = NVPTX::TEX_3D_S32_S32_RR;
    break;
  case NVPTXISD::Tex3DS32Float:
    Opc = NVPTX::TEX_3D_S32_F32_RR;
    break;
  case NVPTXISD::Tex3DS32FloatLevel:
    Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DS32FloatGrad:
    Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DU32S32:
    Opc = NVPTX::TEX_3D_U32_S32_RR;
    break;
  case NVPTXISD::Tex3DU32Float:
    Opc = NVPTX::TEX_3D_U32_F32_RR;
    break;
  case NVPTXISD::Tex3DU32FloatLevel:
    Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DU32FloatGrad:
    Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::TexCubeFloatFloat:
    Opc = NVPTX::TEX_CUBE_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeS32Float:
    Opc = NVPTX::TEX_CUBE_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeU32Float:
    Opc = NVPTX::TEX_CUBE_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloat:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayS32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayU32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tld4R2DFloatFloat:
    Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DFloatFloat:
    Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DFloatFloat:
    Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DFloatFloat:
    Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DS64Float:
    Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DS64Float:
    Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DS64Float:
    Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DS64Float:
    Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DU64Float:
    Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DU64Float:
    Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DU64Float:
    Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DU64Float:
    Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
    break;
  case NVPTXISD::TexUnified1DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DS32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified3DS32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DU32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified3DU32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}

bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
  unsigned Opc = 0;
  switch (N->getOpcode()) {
  default: return false;
  case NVPTXISD::Suld1DI8Clamp:
    Opc = NVPTX::SULD_1D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI16Clamp:
    Opc = NVPTX::SULD_1D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI32Clamp:
    Opc = NVPTX::SULD_1D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI64Clamp:
    Opc = NVPTX::SULD_1D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I8Clamp:
    Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I16Clamp:
    Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I32Clamp:
    Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV2I64Clamp:
    Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I8Clamp:
    Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I16Clamp:
    Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DV4I32Clamp:
    Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayI64Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Clamp:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI8Clamp:
    Opc = NVPTX::SULD_2D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI16Clamp:
    Opc = NVPTX::SULD_2D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI32Clamp:
    Opc = NVPTX::SULD_2D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DI64Clamp:
    Opc = NVPTX::SULD_2D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I8Clamp:
    Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I16Clamp:
    Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I32Clamp:
    Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV2I64Clamp:
    Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I8Clamp:
    Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I16Clamp:
    Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DV4I32Clamp:
    Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayI64Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Clamp:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI8Clamp:
    Opc = NVPTX::SULD_3D_I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI16Clamp:
    Opc = NVPTX::SULD_3D_I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI32Clamp:
    Opc = NVPTX::SULD_3D_I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DI64Clamp:
    Opc = NVPTX::SULD_3D_I64_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I8Clamp:
    Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I16Clamp:
    Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I32Clamp:
    Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV2I64Clamp:
    Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I8Clamp:
    Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I16Clamp:
    Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
    break;
  case NVPTXISD::Suld3DV4I32Clamp:
    Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
    break;
  case NVPTXISD::Suld1DI8Trap:
    Opc = NVPTX::SULD_1D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DI16Trap:
    Opc = NVPTX::SULD_1D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DI32Trap:
    Opc = NVPTX::SULD_1D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DI64Trap:
    Opc = NVPTX::SULD_1D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I8Trap:
    Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I16Trap:
    Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I32Trap:
    Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DV2I64Trap:
    Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I8Trap:
    Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I16Trap:
    Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DV4I32Trap:
    Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayI64Trap:
    Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Trap:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DI8Trap:
    Opc = NVPTX::SULD_2D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DI16Trap:
    Opc = NVPTX::SULD_2D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DI32Trap:
    Opc = NVPTX::SULD_2D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DI64Trap:
    Opc = NVPTX::SULD_2D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I8Trap:
    Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I16Trap:
    Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I32Trap:
    Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DV2I64Trap:
    Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I8Trap:
    Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I16Trap:
    Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DV4I32Trap:
    Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayI64Trap:
    Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Trap:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DI8Trap:
    Opc = NVPTX::SULD_3D_I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DI16Trap:
    Opc = NVPTX::SULD_3D_I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DI32Trap:
    Opc = NVPTX::SULD_3D_I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DI64Trap:
    Opc = NVPTX::SULD_3D_I64_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I8Trap:
    Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I16Trap:
    Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I32Trap:
    Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
    break;
  case NVPTXISD::Suld3DV2I64Trap:
    Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I8Trap:
    Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I16Trap:
    Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
    break;
  case NVPTXISD::Suld3DV4I32Trap:
    Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
    break;
  case NVPTXISD::Suld1DI8Zero:
    Opc = NVPTX::SULD_1D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DI16Zero:
    Opc = NVPTX::SULD_1D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DI32Zero:
    Opc = NVPTX::SULD_1D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DI64Zero:
    Opc = NVPTX::SULD_1D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I8Zero:
    Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I16Zero:
    Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I32Zero:
    Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DV2I64Zero:
    Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I8Zero:
    Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I16Zero:
    Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DV4I32Zero:
    Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayI64Zero:
    Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV2I64Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I8Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I16Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld1DArrayV4I32Zero:
    Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DI8Zero:
    Opc = NVPTX::SULD_2D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DI16Zero:
    Opc = NVPTX::SULD_2D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DI32Zero:
    Opc = NVPTX::SULD_2D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DI64Zero:
    Opc = NVPTX::SULD_2D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I8Zero:
    Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I16Zero:
    Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I32Zero:
    Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I64Zero:
    Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I8Zero:
    Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I16Zero:
    Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I32Zero:
    Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DI8Zero:
    Opc = NVPTX::SULD_3D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DI16Zero:
    Opc = NVPTX::SULD_3D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DI32Zero:
    Opc = NVPTX::SULD_3D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DI64Zero:
    Opc = NVPTX::SULD_3D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I8Zero:
    Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I16Zero:
    Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I32Zero:
    Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I64Zero:
    Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I8Zero:
    Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I16Zero:
    Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I32Zero:
    Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.
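  // The intrinsic SDNode carries its chain as operand 0, but the selected
  // machine instruction takes it as the trailing operand; hence the
  // drop_begin + push_back reordering above.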
3260 3261 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops)); 3262 return true; 3263 } 3264 3265 3266 /// SelectBFE - Look for instruction sequences that can be made more efficient 3267 /// by using the 'bfe' (bit-field extract) PTX instruction 3268 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { 3269 SDLoc DL(N); 3270 SDValue LHS = N->getOperand(0); 3271 SDValue RHS = N->getOperand(1); 3272 SDValue Len; 3273 SDValue Start; 3274 SDValue Val; 3275 bool IsSigned = false; 3276 3277 if (N->getOpcode() == ISD::AND) { 3278 // Canonicalize the operands 3279 // We want 'and %val, %mask' 3280 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) { 3281 std::swap(LHS, RHS); 3282 } 3283 3284 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS); 3285 if (!Mask) { 3286 // We need a constant mask on the RHS of the AND 3287 return false; 3288 } 3289 3290 // Extract the mask bits 3291 uint64_t MaskVal = Mask->getZExtValue(); 3292 if (!isMask_64(MaskVal)) { 3293 // We *could* handle shifted masks here, but doing so would require an 3294 // 'and' operation to fix up the low-order bits so we would trade 3295 // shr+and for bfe+and, which has the same throughput 3296 return false; 3297 } 3298 3299 // How many bits are in our mask? 3300 int64_t NumBits = countr_one(MaskVal); 3301 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32); 3302 3303 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) { 3304 // We have a 'srl/and' pair, extract the effective start bit and length 3305 Val = LHS.getNode()->getOperand(0); 3306 Start = LHS.getNode()->getOperand(1); 3307 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start); 3308 if (StartConst) { 3309 uint64_t StartVal = StartConst->getZExtValue(); 3310 // How many "good" bits do we have left? "good" is defined here as bits 3311 // that exist in the original value, not shifted in. 3312 int64_t GoodBits = Start.getValueSizeInBits() - StartVal; 3313 if (NumBits > GoodBits) { 3314 // Do not handle the case where bits have been shifted in. In theory 3315 // we could handle this, but the cost is likely higher than just 3316 // emitting the srl/and pair. 3317 return false; 3318 } 3319 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32); 3320 } else { 3321 // Do not handle the case where the shift amount (can be zero if no srl 3322 // was found) is not constant. We could handle this case, but it would 3323 // require run-time logic that would be more expensive than just 3324 // emitting the srl/and pair. 3325 return false; 3326 } 3327 } else { 3328 // Do not handle the case where the LHS of the and is not a shift. While 3329 // it would be trivial to handle this case, it would just transform 3330 // 'and' -> 'bfe', but 'and' has higher-throughput. 
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(0);
      SDValue AndRHS = LHS->getOperand(1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(AndLHS)) {
        std::swap(AndLHS, AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(MaskVal)) {
        NumZeros = llvm::countr_zero(MaskVal);
        unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
      Val = LHS->getOperand(0);

      SDValue ShlRHS = LHS->getOperand(1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract. (Since we also check that the inner shift is <= the outer
      // shift, this also implies that the inner shift is < the type size.)
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                        MVT::i32);
      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have an arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // We cannot handle this case: the LHS of the shift is neither an AND
      // nor a SHL.
      return false;
    }
  } else {
    // The root node is neither an AND nor a shift, so there is no bit-field
    // extract pattern to match.
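    // (Select is expected to attempt BFE matching only for AND/SRL/SRA
    // roots, so this is a defensive bail-out.)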
    return false;
  }

  unsigned Opc;
  // For the BFE operations we form here from "and" and "srl", always use the
  // unsigned variants. (The signed flag is set above only for the
  // sra-of-shl pattern.)
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {Val, Start, Len};

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}

// SelectDirectAddr - Match a direct address for DAG.
// A direct address could be a globaladdress or externalsymbol.
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
  // Return true if TGA or ES.
  if (N.getOpcode() == ISD::TargetGlobalAddress ||
      N.getOpcode() == ISD::TargetExternalSymbol) {
    Address = N;
    return true;
  }
  if (N.getOpcode() == NVPTXISD::Wrapper) {
    Address = N.getOperand(0);
    return true;
  }
  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
    if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
  }
  return false;
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  if (Addr.getOpcode() == ISD::ADD) {
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      SDValue base = Addr.getOperand(0);
      if (SelectDirectAddr(base, Base)) {
        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                           mvt);
        return true;
      }
    }
  }
  return false;
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
    Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
    return true;
  }
  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
      Addr.getOpcode() == ISD::TargetGlobalAddress)
    return false; // direct calls.

  if (Addr.getOpcode() == ISD::ADD) {
    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
      return false;
    }
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      if (FrameIndexSDNode *FIN =
              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
        // Constant offset from frame ref.
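        // For example, (add FrameIndex:i32, 16) selects the target frame
        // index as Base and 16 as Offset.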
        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
      else
        Base = Addr.getOperand(0);
      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                         mvt);
      return true;
    }
  }
  return false;
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}

bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                 unsigned int spN) const {
  const Value *Src = nullptr;
  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
      return true;
    Src = mN->getMemOperand()->getValue();
  }
  if (!Src)
    return false;
  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
    return (PT->getAddressSpace() == spN);
  return false;
}

/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1;
  switch (ConstraintID) {
  default:
    return true;
  case InlineAsm::Constraint_m: // memory
    if (SelectDirectAddr(Op, Op0)) {
      OutOps.push_back(Op0);
      OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
      return false;
    }
    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
      OutOps.push_back(Op0);
      OutOps.push_back(Op1);
      return false;
    }
    break;
  }
  return true;
}

/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
/// conversion from \p SrcTy to \p DestTy.
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
                                             LoadSDNode *LdNode) {
  bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
  switch (SrcTy.SimpleTy) {
  default:
    llvm_unreachable("Unhandled source type");
  case MVT::i8:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
    }
  case MVT::i16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
    }
  case MVT::i32:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
    }
  case MVT::i64:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
    }
  case MVT::f16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::f32:
      return NVPTX::CVT_f32_f16;
    case MVT::f64:
      return NVPTX::CVT_f64_f16;
    }
  }
}
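
// Illustrative usage (not from the original source): for an i8 -> i32
// widening conversion, the opcode depends on whether the feeding load
// sign-extends.
//
//   LoadSDNode *LD = ...;  // some extending i8 load (hypothetical)
//   unsigned Cvt = GetConvertOpcode(MVT::i32, MVT::i8, LD);
//   // Cvt is NVPTX::CVT_s32_s8 if LD->getExtensionType() == ISD::SEXTLOAD,
//   // and NVPTX::CVT_u32_u8 otherwise (including when LD is null).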