//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));

/// createNVPTXISelDag - This pass converts a legalized DAG into an
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOptLevel OptLevel) {
  return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
}

NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
                                                 CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}

char NVPTXDAGToDAGISelLegacy::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOptLevel OptLevel)
    : SelectionDAGISel(tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > CodeGenOptLevel::None);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }
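
// A usage sketch (assuming the standard cl::opt plumbing that llc exposes):
// the reciprocal-sqrt rewrite gated by EnableRsqrtOpt/doRsqrtOpt() above can
// be disabled from the command line with:
//   llc -mtriple=nvptx64 -nvptx-rsqrt-approx-opt=false input.ll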

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;
  case NVPTXISD::SETP_BF16X2:
    SelectSETP_BF16X2(N);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeFloatFloatGrad:
  case NVPTXISD::TexUnifiedCubeS32FloatGrad:
  case NVPTXISD::TexUnifiedCubeU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP(N))
      return;
    break;
  case ISD::CopyToReg: {
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectV2I64toI128(N);
      return;
    }
    break;
  }
  case ISD::CopyFromReg: {
    if (N->getOperand(1).getValueType() == MVT::i128) {
      SelectI128toV2I64(N);
      return;
    }
    break;
  }
  default:
    break;
  }
  SelectCode(N);
}

bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}

// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
// have to load them into an .(b)f16 register first.
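// For example (a sketch of the selection below), an (f16 (ConstantFP 1.0))
// node becomes
//   f16 = LOAD_CONST_F16 (TargetConstantFP 1.0)
// so the constant is materialized in a 16-bit register rather than encoded
// as an immediate operand.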
bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
  if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
  SDNode *LoadConstF16 = CurDAG->getMachineNode(
      (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
                                      : NVPTX::LOAD_CONST_BF16),
      SDLoc(N), N->getValueType(0), Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}

// Map ISD::CONDCODE value to appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode()
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}

bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}
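
// As an illustration of the two selectors above (operand order per the
// SETP_f16x2rr/SETP_bf16x2rr definitions): a (setcc v2f16 A, B, setolt) node
// becomes
//   i1, i1 = SETP_f16x2rr A, B, CmpMode::LT
// i.e. one packed comparison yielding a predicate per lane, with the FTZ flag
// folded into the CmpMode immediate when useF32FTZ() is set.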

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about 16x2 as it's the only real vector type we
  // need to deal with.
  MVT VT = Vector.getSimpleValueType();
  if (!Isv2x16VT(VT))
    return false;
  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto *U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
  // into f16,f16 SplitF16x2(V)
  MVT EltVT = VT.getVectorElementType();
  SDNode *ScatterOp =
      CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}

static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2.  We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
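  // For example (a sketch): in a kernel where "float * __restrict in" is a
  // read-only parameter, a load of in[i] whose pointer arrives through a PHI
  // still reaches the Argument here, is treated as invariant, and can be
  // emitted as ld.global.nc.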
  SmallVector<const Value *, 8> Objs;
  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

  return all_of(Objs, [&](const Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}

bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(0);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}
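
// A rough sketch of what SelectAddrSpaceCast below produces: on a 64-bit
// target, casting a shared-space pointer to generic selects cvta_shared_64,
// which prints as
//   cvta.shared.u64 %rd1, %rd2;
// and the generic-to-shared direction selects the cvta.to.* counterpart.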
void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_shared_6432
                                : NVPTX::cvta_shared_64)
                         : NVPTX::cvta_shared;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_const_6432
                                : NVPTX::cvta_const_64)
                         : NVPTX::cvta_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_local_6432
                                : NVPTX::cvta_local_64)
                         : NVPTX::cvta_local;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_shared_3264
                                : NVPTX::cvta_to_shared_64)
                         : NVPTX::cvta_to_shared;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_const_3264
                                : NVPTX::cvta_to_const_64)
                         : NVPTX::cvta_to_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_local_3264
                                : NVPTX::cvta_to_local_64)
                         : NVPTX::cvta_to_local;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}

// Helper function to reduce the amount of boilerplate code needed for
// opcode selection.
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                unsigned Opcode_i16, unsigned Opcode_i32,
                std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
                std::optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
  case MVT::bf16:
    return Opcode_i16;
  case MVT::v2f16:
  case MVT::v2bf16:
  case MVT::v2i16:
  case MVT::v4i8:
    return Opcode_i32;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return std::nullopt;
  }
}
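
// A usage sketch for pickOpcodeForVT above:
//   pickOpcodeForVT(MVT::i32, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
//                   NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
//                   NVPTX::LD_f32_avar, NVPTX::LD_f64_avar)
// yields NVPTX::LD_i32_avar. Note that f16/bf16 (and their v2 forms, plus
// v4i8) reuse the integer opcode of the same width, since those values live
// in untyped registers.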
static int getLdStRegType(EVT VT) {
  if (VT.isFloatingPoint())
    switch (VT.getSimpleVT().SimpleTy) {
    case MVT::f16:
    case MVT::bf16:
    case MVT::v2f16:
    case MVT::v2bf16:
      return NVPTX::PTXLdStInstCode::Untyped;
    default:
      return NVPTX::PTXLdStInstCode::Float;
    }
  else
    return NVPTX::PTXLdStInstCode::Unsigned;
}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2f16/v2bf16/v2i16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
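
  // The four addressing forms tried below mirror the instruction-name
  // suffixes: _avar (direct symbol), _asi (symbol + immediate offset),
  // _ari (register + immediate offset), and _areg (plain register).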
  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
                          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
                          NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8x16 is a special case. PTX doesn't have ld.v8.16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // load them with ld.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }
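
  // E.g. with this rewrite, a NVPTXISD::LoadV4 of v8f16 is emitted as
  // ld.v4.b32, each of the four i32 results holding one v2f16 pair.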

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second
  // operand.
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = N->getConstantOperandVal(1);
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;
  EVT OrigType = N->getValueType(0);

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // Vectors of 16-bit types are loaded/stored as multiples of v2x16
    // elements.
    if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
        (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
        (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = OrigType;
      NumElts /= 2;
    } else if (OrigType == MVT::v4i8) {
      EltVT = OrigType;
      NumElts = 1;
    }
  }
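
  // For instance, a v4f16 memory type is reshaped here to NumElts = 2
  // elements of type v2f16, matching the v2x16 chunks the instructions
  // actually return.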

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::INTRINSIC_W_CHAIN:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  }

  // For automatic generation of LDG (through SelectLoad[Vector], not the
  // intrinsics), we may have an extending load like:
  //
  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
  //
  // In this case, the matching logic above will select a load for the original
  // memory type (i8 here) and our types will not match (the node needs to
  // return an i32). Our LDG/LDU nodes do not support the concept of
  // sign-/zero-extension, so emulate it here by adding an explicit CVT
  // instruction. Ptxas should clean up any redundancies here.

  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

  if (OrigType != EltVT &&
      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
    // We have an extending-load. The instruction we selected operates on the
    // smaller type, but the SDNode we are replacing has the larger type. We
    // need to emit a CVT to make the types match.
    unsigned CvtOpc =
        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);

    // For each output value, apply the manual sign/zero-extension and make
    // sure all users of the load go through that CVT.
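    // (A sketch: for an i8 -> i32 zero-extending load, each result is wrapped
    // as
    //   i32 = CVT_u32_u8 Res, CvtNONE
    // -- opcode name illustrative -- and the original uses are rewired to the
    // CVT's output.)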
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode =
          CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
                                 CurDAG->getTargetConstant(
                                     NVPTX::PTXCvtMode::NONE, DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // We do not support pre/post inc/dec.
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement
  // them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2x16 is stored using st.b32.
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
                          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
                          NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64,
                               NVPTX::ST_i16_areg_64, NVPTX::ST_i32_areg_64,
                               NVPTX::ST_i64_areg_64, NVPTX::ST_f32_areg_64,
                               NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;
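  // Generic addresses are accepted above as well, since a generic pointer may
  // refer to global or shared memory, where .volatile is meaningful.
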
  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType = getLdStRegType(ScalarVT);

  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8x16 is a special case. PTX does not have a st.v8.x16 instruction.
  // Instead, we split the vector into v2x16 chunks and store them with
  // st.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
    EltVT = MVT::i32;
    ToType = NVPTX::PTXLdStInstCode::Untyped;
    ToTypeWidth = 32;
  }

  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));

  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
                               NVPTX::STV_i32_v4_avar, std::nullopt,
                               NVPTX::STV_f32_v4_avar, std::nullopt);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
                          std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
            NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
            NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
            NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
            NVPTX::STV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
                                 NVPTX::STV_i32_v4_ari, std::nullopt,
                                 NVPTX::STV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
            NVPTX::STV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
                            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
                            std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    StOps.push_back(N2);
  }

  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});

  ReplaceNode(N, ST);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
  SDValue Chain = Node->getOperand(0);
  SDValue Offset = Node->getOperand(2);
  SDValue Glue = Node->getOperand(3);
  SDLoc DL(Node);
  MemSDNode *Mem = cast<MemSDNode>(Node);

  unsigned VecSize;
  switch (Node->getOpcode()) {
  default:
    return false;
  case NVPTXISD::LoadParam:
    VecSize = 1;
    break;
  case NVPTXISD::LoadParamV2:
    VecSize = 2;
    break;
  case NVPTXISD::LoadParamV4:
    VecSize = 4;
    break;
  }

  EVT EltVT = Node->getValueType(0);
  EVT MemVT = Mem->getMemoryVT();

  std::optional<unsigned> Opcode;

  switch (VecSize) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
                             NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
                             NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
                             NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
    break;
  case 2:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
                        NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
                        NVPTX::LoadParamMemV2F64);
    break;
  case 4:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
                        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
                        std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDVTList VTs;
  if (VecSize == 1) {
    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
  } else if (VecSize == 2) {
    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
  } else {
    EVT EVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue};
    VTs = CurDAG->getVTList(EVTs);
  }

  unsigned OffsetVal = Offset->getAsZExtVal();

  SmallVector<SDValue, 2> Ops;
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);

  ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Offset = N->getOperand(1);
  unsigned OffsetVal = Offset->getAsZExtVal();
  MemSDNode *Mem = cast<MemSDNode>(N);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreRetval:
    NumElts = 1;
    break;
  case NVPTXISD::StoreRetvalV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreRetvalV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 6> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 2));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
  std::optional<unsigned> Opcode;
  switch (NumElts) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
                             NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
                             NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
    if (Opcode == NVPTX::StoreRetvalI8) {
      // Fine-tune the opcode depending on the size of the operand. This helps
      // to avoid creating redundant COPY instructions in
      // InstrEmitter::AddRegisterOperand().
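      // For example, an i1 return value that was promoted to i32 is stored
      // with StoreRetvalI8TruncI32, which consumes the i32 register directly.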
      switch (Ops[0].getSimpleValueType().SimpleTy) {
      default:
        break;
      case MVT::i32:
        Opcode = NVPTX::StoreRetvalI8TruncI32;
        break;
      case MVT::i64:
        Opcode = NVPTX::StoreRetvalI8TruncI64;
        break;
      }
    }
    break;
  case 2:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
                             NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
                             NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
    break;
  case 4:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
                             NVPTX::StoreRetvalV4I32, std::nullopt,
                             NVPTX::StoreRetvalV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
  return true;
}

// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri)
#define getOpcV2H(ty, opKind0, opKind1)                                        \
  NVPTX::StoreParamV2##ty##_##opKind0##opKind1

#define getOpcV2H1(ty, opKind0, isImm1)                                        \
  (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)

#define getOpcodeForVectorStParamV2(ty, isimm)                                 \
  (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])

#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3)                      \
  NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3

#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3)                      \
  (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i)                       \
           : getOpcV4H(ty, opKind0, opKind1, opKind2, r)

#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3)                       \
  (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3)                       \
           : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)

#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3)                        \
  (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3)                        \
           : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)

#define getOpcodeForVectorStParamV4(ty, isimm)                                 \
  (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3])                 \
             : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])

#define getOpcodeForVectorStParam(n, ty, isimm)                                \
  (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm)                            \
           : getOpcodeForVectorStParamV4(ty, isimm)
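
// For example, getOpcodeForVectorStParamV4(I32, IsImm) with
// IsImm = {true, false, false, true} selects NVPTX::StoreParamV4I32_irri:
// each 'i'/'r' suffix position records whether the corresponding value
// operand is an immediate or a register.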
static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
                                           unsigned NumElts,
                                           MVT::SimpleValueType MemTy,
                                           SelectionDAG *CurDAG, SDLoc DL) {
  // Determine which inputs are registers and which are immediates, and
  // rebuild the immediate operands as target constants.
  SmallVector<bool, 4> IsImm(NumElts, false);
  for (unsigned i = 0; i < NumElts; i++) {
    IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
    if (IsImm[i]) {
      SDValue Imm = Ops[i];
      if (MemTy == MVT::f32 || MemTy == MVT::f64) {
        const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
        const ConstantFP *CF = ConstImm->getConstantFPValue();
        Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
      } else {
        const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
        const ConstantInt *CI = ConstImm->getConstantIntValue();
        Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
      }
      Ops[i] = Imm;
    }
  }

  // Get the opcode for MemTy, size, and register/immediate operand ordering.
  switch (MemTy) {
  case MVT::i8:
    return getOpcodeForVectorStParam(NumElts, I8, IsImm);
  case MVT::i16:
    return getOpcodeForVectorStParam(NumElts, I16, IsImm);
  case MVT::i32:
    return getOpcodeForVectorStParam(NumElts, I32, IsImm);
  case MVT::i64:
    assert(NumElts == 2 && "MVT too large for NumElts > 2");
    return getOpcodeForVectorStParamV2(I64, IsImm);
  case MVT::f32:
    return getOpcodeForVectorStParam(NumElts, F32, IsImm);
  case MVT::f64:
    assert(NumElts == 2 && "MVT too large for NumElts > 2");
    return getOpcodeForVectorStParamV2(F64, IsImm);

  // These cases do not support immediates; just use the all-register version
  // and generate moves.
  case MVT::i1:
    return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
                          : NVPTX::StoreParamV4I8_rrrr;
  case MVT::f16:
  case MVT::bf16:
    return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
                          : NVPTX::StoreParamV4I16_rrrr;
  case MVT::v2f16:
  case MVT::v2bf16:
  case MVT::v2i16:
  case MVT::v4i8:
    return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
                          : NVPTX::StoreParamV4I32_rrrr;
  default:
    llvm_unreachable("Cannot select st.param for unknown MemTy");
  }
}

bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Param = N->getOperand(1);
  unsigned ParamVal = Param->getAsZExtVal();
  SDValue Offset = N->getOperand(2);
  unsigned OffsetVal = Offset->getAsZExtVal();
  MemSDNode *Mem = cast<MemSDNode>(N);
  SDValue Glue = N->getOperand(N->getNumOperands() - 1);
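  // The trailing glue operand keeps this st.param pinned to the surrounding
  // call sequence so the scheduler cannot separate them.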

  // How many elements do we have?
  unsigned NumElts;
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode");
  case NVPTXISD::StoreParamU32:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParam:
    NumElts = 1;
    break;
  case NVPTXISD::StoreParamV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreParamV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 3));
  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
  std::optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  default:
    switch (NumElts) {
    default:
      llvm_unreachable("Unexpected NumElts");
    case 1: {
      MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
      SDValue Imm = Ops[0];
      if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
          (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
        // Convert immediate to target constant
        if (MemTy == MVT::f32 || MemTy == MVT::f64) {
          const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
          const ConstantFP *CF = ConstImm->getConstantFPValue();
          Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
        } else {
          const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
          const ConstantInt *CI = ConstImm->getConstantIntValue();
          Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
        }
        Ops[0] = Imm;
        // Use immediate version of store param
        Opcode =
            pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
                            NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
                            NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
                            NVPTX::StoreParamF64_i);
      } else
        Opcode =
            pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                            NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
                            NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
                            NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
      if (Opcode == NVPTX::StoreParamI8_r) {
        // Fine-tune the opcode depending on the size of the operand. This
        // helps to avoid creating redundant COPY instructions in
        // InstrEmitter::AddRegisterOperand().
        switch (Ops[0].getSimpleValueType().SimpleTy) {
        default:
          break;
        case MVT::i32:
          Opcode = NVPTX::StoreParamI8TruncI32_r;
          break;
        case MVT::i64:
          Opcode = NVPTX::StoreParamI8TruncI64_r;
          break;
        }
      }
      break;
    }
    case 2:
    case 4: {
      MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
      Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
      break;
    }
    }
    break;
  // Special case: if we have a sign-extend/zero-extend node, insert the
  // conversion instruction first, and use that as the value operand to
  // the selected StoreParam node.
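  // For example, StoreParamU32 of an i16 value first widens it with
  // cvt.u32.u16 (mode NONE) and then emits the plain 32-bit st.param.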
2399 case NVPTXISD::StoreParamU32: { 2400 Opcode = NVPTX::StoreParamI32_r; 2401 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, 2402 MVT::i32); 2403 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL, 2404 MVT::i32, Ops[0], CvtNone); 2405 Ops[0] = SDValue(Cvt, 0); 2406 break; 2407 } 2408 case NVPTXISD::StoreParamS32: { 2409 Opcode = NVPTX::StoreParamI32_r; 2410 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, 2411 MVT::i32); 2412 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL, 2413 MVT::i32, Ops[0], CvtNone); 2414 Ops[0] = SDValue(Cvt, 0); 2415 break; 2416 } 2417 } 2418 2419 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); 2420 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops); 2421 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); 2422 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef}); 2423 2424 ReplaceNode(N, Ret); 2425 return true; 2426 } 2427 2428 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) { 2429 unsigned Opc = 0; 2430 2431 switch (N->getOpcode()) { 2432 default: return false; 2433 case NVPTXISD::Tex1DFloatS32: 2434 Opc = NVPTX::TEX_1D_F32_S32_RR; 2435 break; 2436 case NVPTXISD::Tex1DFloatFloat: 2437 Opc = NVPTX::TEX_1D_F32_F32_RR; 2438 break; 2439 case NVPTXISD::Tex1DFloatFloatLevel: 2440 Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR; 2441 break; 2442 case NVPTXISD::Tex1DFloatFloatGrad: 2443 Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR; 2444 break; 2445 case NVPTXISD::Tex1DS32S32: 2446 Opc = NVPTX::TEX_1D_S32_S32_RR; 2447 break; 2448 case NVPTXISD::Tex1DS32Float: 2449 Opc = NVPTX::TEX_1D_S32_F32_RR; 2450 break; 2451 case NVPTXISD::Tex1DS32FloatLevel: 2452 Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR; 2453 break; 2454 case NVPTXISD::Tex1DS32FloatGrad: 2455 Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR; 2456 break; 2457 case NVPTXISD::Tex1DU32S32: 2458 Opc = NVPTX::TEX_1D_U32_S32_RR; 2459 break; 2460 case NVPTXISD::Tex1DU32Float: 2461 Opc = NVPTX::TEX_1D_U32_F32_RR; 2462 break; 2463 case NVPTXISD::Tex1DU32FloatLevel: 2464 Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR; 2465 break; 2466 case NVPTXISD::Tex1DU32FloatGrad: 2467 Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR; 2468 break; 2469 case NVPTXISD::Tex1DArrayFloatS32: 2470 Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR; 2471 break; 2472 case NVPTXISD::Tex1DArrayFloatFloat: 2473 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR; 2474 break; 2475 case NVPTXISD::Tex1DArrayFloatFloatLevel: 2476 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR; 2477 break; 2478 case NVPTXISD::Tex1DArrayFloatFloatGrad: 2479 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR; 2480 break; 2481 case NVPTXISD::Tex1DArrayS32S32: 2482 Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR; 2483 break; 2484 case NVPTXISD::Tex1DArrayS32Float: 2485 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR; 2486 break; 2487 case NVPTXISD::Tex1DArrayS32FloatLevel: 2488 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR; 2489 break; 2490 case NVPTXISD::Tex1DArrayS32FloatGrad: 2491 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR; 2492 break; 2493 case NVPTXISD::Tex1DArrayU32S32: 2494 Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR; 2495 break; 2496 case NVPTXISD::Tex1DArrayU32Float: 2497 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR; 2498 break; 2499 case NVPTXISD::Tex1DArrayU32FloatLevel: 2500 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR; 2501 break; 2502 case NVPTXISD::Tex1DArrayU32FloatGrad: 2503 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR; 2504 break; 2505 case NVPTXISD::Tex2DFloatS32: 2506 Opc = NVPTX::TEX_2D_F32_S32_RR; 2507 break; 2508 case NVPTXISD::Tex2DFloatFloat: 2509 Opc = 
NVPTX::TEX_2D_F32_F32_RR; 2510 break; 2511 case NVPTXISD::Tex2DFloatFloatLevel: 2512 Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR; 2513 break; 2514 case NVPTXISD::Tex2DFloatFloatGrad: 2515 Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR; 2516 break; 2517 case NVPTXISD::Tex2DS32S32: 2518 Opc = NVPTX::TEX_2D_S32_S32_RR; 2519 break; 2520 case NVPTXISD::Tex2DS32Float: 2521 Opc = NVPTX::TEX_2D_S32_F32_RR; 2522 break; 2523 case NVPTXISD::Tex2DS32FloatLevel: 2524 Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR; 2525 break; 2526 case NVPTXISD::Tex2DS32FloatGrad: 2527 Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR; 2528 break; 2529 case NVPTXISD::Tex2DU32S32: 2530 Opc = NVPTX::TEX_2D_U32_S32_RR; 2531 break; 2532 case NVPTXISD::Tex2DU32Float: 2533 Opc = NVPTX::TEX_2D_U32_F32_RR; 2534 break; 2535 case NVPTXISD::Tex2DU32FloatLevel: 2536 Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR; 2537 break; 2538 case NVPTXISD::Tex2DU32FloatGrad: 2539 Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR; 2540 break; 2541 case NVPTXISD::Tex2DArrayFloatS32: 2542 Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR; 2543 break; 2544 case NVPTXISD::Tex2DArrayFloatFloat: 2545 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR; 2546 break; 2547 case NVPTXISD::Tex2DArrayFloatFloatLevel: 2548 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR; 2549 break; 2550 case NVPTXISD::Tex2DArrayFloatFloatGrad: 2551 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR; 2552 break; 2553 case NVPTXISD::Tex2DArrayS32S32: 2554 Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR; 2555 break; 2556 case NVPTXISD::Tex2DArrayS32Float: 2557 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR; 2558 break; 2559 case NVPTXISD::Tex2DArrayS32FloatLevel: 2560 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR; 2561 break; 2562 case NVPTXISD::Tex2DArrayS32FloatGrad: 2563 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR; 2564 break; 2565 case NVPTXISD::Tex2DArrayU32S32: 2566 Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR; 2567 break; 2568 case NVPTXISD::Tex2DArrayU32Float: 2569 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR; 2570 break; 2571 case NVPTXISD::Tex2DArrayU32FloatLevel: 2572 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR; 2573 break; 2574 case NVPTXISD::Tex2DArrayU32FloatGrad: 2575 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR; 2576 break; 2577 case NVPTXISD::Tex3DFloatS32: 2578 Opc = NVPTX::TEX_3D_F32_S32_RR; 2579 break; 2580 case NVPTXISD::Tex3DFloatFloat: 2581 Opc = NVPTX::TEX_3D_F32_F32_RR; 2582 break; 2583 case NVPTXISD::Tex3DFloatFloatLevel: 2584 Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR; 2585 break; 2586 case NVPTXISD::Tex3DFloatFloatGrad: 2587 Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR; 2588 break; 2589 case NVPTXISD::Tex3DS32S32: 2590 Opc = NVPTX::TEX_3D_S32_S32_RR; 2591 break; 2592 case NVPTXISD::Tex3DS32Float: 2593 Opc = NVPTX::TEX_3D_S32_F32_RR; 2594 break; 2595 case NVPTXISD::Tex3DS32FloatLevel: 2596 Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR; 2597 break; 2598 case NVPTXISD::Tex3DS32FloatGrad: 2599 Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR; 2600 break; 2601 case NVPTXISD::Tex3DU32S32: 2602 Opc = NVPTX::TEX_3D_U32_S32_RR; 2603 break; 2604 case NVPTXISD::Tex3DU32Float: 2605 Opc = NVPTX::TEX_3D_U32_F32_RR; 2606 break; 2607 case NVPTXISD::Tex3DU32FloatLevel: 2608 Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR; 2609 break; 2610 case NVPTXISD::Tex3DU32FloatGrad: 2611 Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR; 2612 break; 2613 case NVPTXISD::TexCubeFloatFloat: 2614 Opc = NVPTX::TEX_CUBE_F32_F32_RR; 2615 break; 2616 case NVPTXISD::TexCubeFloatFloatLevel: 2617 Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR; 2618 break; 2619 case NVPTXISD::TexCubeS32Float: 2620 Opc = NVPTX::TEX_CUBE_S32_F32_RR; 2621 break; 2622 case NVPTXISD::TexCubeS32FloatLevel: 2623 Opc = 
NVPTX::TEX_CUBE_S32_F32_LEVEL_RR; 2624 break; 2625 case NVPTXISD::TexCubeU32Float: 2626 Opc = NVPTX::TEX_CUBE_U32_F32_RR; 2627 break; 2628 case NVPTXISD::TexCubeU32FloatLevel: 2629 Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR; 2630 break; 2631 case NVPTXISD::TexCubeArrayFloatFloat: 2632 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR; 2633 break; 2634 case NVPTXISD::TexCubeArrayFloatFloatLevel: 2635 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR; 2636 break; 2637 case NVPTXISD::TexCubeArrayS32Float: 2638 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR; 2639 break; 2640 case NVPTXISD::TexCubeArrayS32FloatLevel: 2641 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR; 2642 break; 2643 case NVPTXISD::TexCubeArrayU32Float: 2644 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR; 2645 break; 2646 case NVPTXISD::TexCubeArrayU32FloatLevel: 2647 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR; 2648 break; 2649 case NVPTXISD::Tld4R2DFloatFloat: 2650 Opc = NVPTX::TLD4_R_2D_F32_F32_RR; 2651 break; 2652 case NVPTXISD::Tld4G2DFloatFloat: 2653 Opc = NVPTX::TLD4_G_2D_F32_F32_RR; 2654 break; 2655 case NVPTXISD::Tld4B2DFloatFloat: 2656 Opc = NVPTX::TLD4_B_2D_F32_F32_RR; 2657 break; 2658 case NVPTXISD::Tld4A2DFloatFloat: 2659 Opc = NVPTX::TLD4_A_2D_F32_F32_RR; 2660 break; 2661 case NVPTXISD::Tld4R2DS64Float: 2662 Opc = NVPTX::TLD4_R_2D_S32_F32_RR; 2663 break; 2664 case NVPTXISD::Tld4G2DS64Float: 2665 Opc = NVPTX::TLD4_G_2D_S32_F32_RR; 2666 break; 2667 case NVPTXISD::Tld4B2DS64Float: 2668 Opc = NVPTX::TLD4_B_2D_S32_F32_RR; 2669 break; 2670 case NVPTXISD::Tld4A2DS64Float: 2671 Opc = NVPTX::TLD4_A_2D_S32_F32_RR; 2672 break; 2673 case NVPTXISD::Tld4R2DU64Float: 2674 Opc = NVPTX::TLD4_R_2D_U32_F32_RR; 2675 break; 2676 case NVPTXISD::Tld4G2DU64Float: 2677 Opc = NVPTX::TLD4_G_2D_U32_F32_RR; 2678 break; 2679 case NVPTXISD::Tld4B2DU64Float: 2680 Opc = NVPTX::TLD4_B_2D_U32_F32_RR; 2681 break; 2682 case NVPTXISD::Tld4A2DU64Float: 2683 Opc = NVPTX::TLD4_A_2D_U32_F32_RR; 2684 break; 2685 case NVPTXISD::TexUnified1DFloatS32: 2686 Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R; 2687 break; 2688 case NVPTXISD::TexUnified1DFloatFloat: 2689 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R; 2690 break; 2691 case NVPTXISD::TexUnified1DFloatFloatLevel: 2692 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R; 2693 break; 2694 case NVPTXISD::TexUnified1DFloatFloatGrad: 2695 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R; 2696 break; 2697 case NVPTXISD::TexUnified1DS32S32: 2698 Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R; 2699 break; 2700 case NVPTXISD::TexUnified1DS32Float: 2701 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R; 2702 break; 2703 case NVPTXISD::TexUnified1DS32FloatLevel: 2704 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R; 2705 break; 2706 case NVPTXISD::TexUnified1DS32FloatGrad: 2707 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R; 2708 break; 2709 case NVPTXISD::TexUnified1DU32S32: 2710 Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R; 2711 break; 2712 case NVPTXISD::TexUnified1DU32Float: 2713 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R; 2714 break; 2715 case NVPTXISD::TexUnified1DU32FloatLevel: 2716 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R; 2717 break; 2718 case NVPTXISD::TexUnified1DU32FloatGrad: 2719 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R; 2720 break; 2721 case NVPTXISD::TexUnified1DArrayFloatS32: 2722 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R; 2723 break; 2724 case NVPTXISD::TexUnified1DArrayFloatFloat: 2725 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R; 2726 break; 2727 case NVPTXISD::TexUnified1DArrayFloatFloatLevel: 2728 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R; 2729 break; 2730 case 
NVPTXISD::TexUnified1DArrayFloatFloatGrad: 2731 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R; 2732 break; 2733 case NVPTXISD::TexUnified1DArrayS32S32: 2734 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R; 2735 break; 2736 case NVPTXISD::TexUnified1DArrayS32Float: 2737 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R; 2738 break; 2739 case NVPTXISD::TexUnified1DArrayS32FloatLevel: 2740 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R; 2741 break; 2742 case NVPTXISD::TexUnified1DArrayS32FloatGrad: 2743 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R; 2744 break; 2745 case NVPTXISD::TexUnified1DArrayU32S32: 2746 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R; 2747 break; 2748 case NVPTXISD::TexUnified1DArrayU32Float: 2749 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R; 2750 break; 2751 case NVPTXISD::TexUnified1DArrayU32FloatLevel: 2752 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R; 2753 break; 2754 case NVPTXISD::TexUnified1DArrayU32FloatGrad: 2755 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R; 2756 break; 2757 case NVPTXISD::TexUnified2DFloatS32: 2758 Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R; 2759 break; 2760 case NVPTXISD::TexUnified2DFloatFloat: 2761 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R; 2762 break; 2763 case NVPTXISD::TexUnified2DFloatFloatLevel: 2764 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R; 2765 break; 2766 case NVPTXISD::TexUnified2DFloatFloatGrad: 2767 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R; 2768 break; 2769 case NVPTXISD::TexUnified2DS32S32: 2770 Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R; 2771 break; 2772 case NVPTXISD::TexUnified2DS32Float: 2773 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R; 2774 break; 2775 case NVPTXISD::TexUnified2DS32FloatLevel: 2776 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R; 2777 break; 2778 case NVPTXISD::TexUnified2DS32FloatGrad: 2779 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R; 2780 break; 2781 case NVPTXISD::TexUnified2DU32S32: 2782 Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R; 2783 break; 2784 case NVPTXISD::TexUnified2DU32Float: 2785 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R; 2786 break; 2787 case NVPTXISD::TexUnified2DU32FloatLevel: 2788 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R; 2789 break; 2790 case NVPTXISD::TexUnified2DU32FloatGrad: 2791 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R; 2792 break; 2793 case NVPTXISD::TexUnified2DArrayFloatS32: 2794 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R; 2795 break; 2796 case NVPTXISD::TexUnified2DArrayFloatFloat: 2797 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R; 2798 break; 2799 case NVPTXISD::TexUnified2DArrayFloatFloatLevel: 2800 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R; 2801 break; 2802 case NVPTXISD::TexUnified2DArrayFloatFloatGrad: 2803 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R; 2804 break; 2805 case NVPTXISD::TexUnified2DArrayS32S32: 2806 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R; 2807 break; 2808 case NVPTXISD::TexUnified2DArrayS32Float: 2809 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R; 2810 break; 2811 case NVPTXISD::TexUnified2DArrayS32FloatLevel: 2812 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R; 2813 break; 2814 case NVPTXISD::TexUnified2DArrayS32FloatGrad: 2815 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R; 2816 break; 2817 case NVPTXISD::TexUnified2DArrayU32S32: 2818 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R; 2819 break; 2820 case NVPTXISD::TexUnified2DArrayU32Float: 2821 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R; 2822 break; 2823 case NVPTXISD::TexUnified2DArrayU32FloatLevel: 2824 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R; 2825 break; 2826 case 
NVPTXISD::TexUnified2DArrayU32FloatGrad: 2827 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R; 2828 break; 2829 case NVPTXISD::TexUnified3DFloatS32: 2830 Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R; 2831 break; 2832 case NVPTXISD::TexUnified3DFloatFloat: 2833 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R; 2834 break; 2835 case NVPTXISD::TexUnified3DFloatFloatLevel: 2836 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R; 2837 break; 2838 case NVPTXISD::TexUnified3DFloatFloatGrad: 2839 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R; 2840 break; 2841 case NVPTXISD::TexUnified3DS32S32: 2842 Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R; 2843 break; 2844 case NVPTXISD::TexUnified3DS32Float: 2845 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R; 2846 break; 2847 case NVPTXISD::TexUnified3DS32FloatLevel: 2848 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R; 2849 break; 2850 case NVPTXISD::TexUnified3DS32FloatGrad: 2851 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R; 2852 break; 2853 case NVPTXISD::TexUnified3DU32S32: 2854 Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R; 2855 break; 2856 case NVPTXISD::TexUnified3DU32Float: 2857 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R; 2858 break; 2859 case NVPTXISD::TexUnified3DU32FloatLevel: 2860 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R; 2861 break; 2862 case NVPTXISD::TexUnified3DU32FloatGrad: 2863 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R; 2864 break; 2865 case NVPTXISD::TexUnifiedCubeFloatFloat: 2866 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R; 2867 break; 2868 case NVPTXISD::TexUnifiedCubeFloatFloatLevel: 2869 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R; 2870 break; 2871 case NVPTXISD::TexUnifiedCubeS32Float: 2872 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R; 2873 break; 2874 case NVPTXISD::TexUnifiedCubeS32FloatLevel: 2875 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R; 2876 break; 2877 case NVPTXISD::TexUnifiedCubeU32Float: 2878 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R; 2879 break; 2880 case NVPTXISD::TexUnifiedCubeU32FloatLevel: 2881 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R; 2882 break; 2883 case NVPTXISD::TexUnifiedCubeArrayFloatFloat: 2884 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R; 2885 break; 2886 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: 2887 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R; 2888 break; 2889 case NVPTXISD::TexUnifiedCubeArrayS32Float: 2890 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R; 2891 break; 2892 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: 2893 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R; 2894 break; 2895 case NVPTXISD::TexUnifiedCubeArrayU32Float: 2896 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R; 2897 break; 2898 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: 2899 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R; 2900 break; 2901 case NVPTXISD::Tld4UnifiedR2DFloatFloat: 2902 Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R; 2903 break; 2904 case NVPTXISD::Tld4UnifiedG2DFloatFloat: 2905 Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R; 2906 break; 2907 case NVPTXISD::Tld4UnifiedB2DFloatFloat: 2908 Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R; 2909 break; 2910 case NVPTXISD::Tld4UnifiedA2DFloatFloat: 2911 Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R; 2912 break; 2913 case NVPTXISD::Tld4UnifiedR2DS64Float: 2914 Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R; 2915 break; 2916 case NVPTXISD::Tld4UnifiedG2DS64Float: 2917 Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R; 2918 break; 2919 case NVPTXISD::Tld4UnifiedB2DS64Float: 2920 Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R; 2921 break; 2922 case NVPTXISD::Tld4UnifiedA2DS64Float: 2923 Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R; 2924 break; 2925 case 
NVPTXISD::Tld4UnifiedR2DU64Float: 2926 Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R; 2927 break; 2928 case NVPTXISD::Tld4UnifiedG2DU64Float: 2929 Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R; 2930 break; 2931 case NVPTXISD::Tld4UnifiedB2DU64Float: 2932 Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R; 2933 break; 2934 case NVPTXISD::Tld4UnifiedA2DU64Float: 2935 Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R; 2936 break; 2937 case NVPTXISD::TexUnifiedCubeFloatFloatGrad: 2938 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R; 2939 break; 2940 case NVPTXISD::TexUnifiedCubeS32FloatGrad: 2941 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R; 2942 break; 2943 case NVPTXISD::TexUnifiedCubeU32FloatGrad: 2944 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R; 2945 break; 2946 case NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad: 2947 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R; 2948 break; 2949 case NVPTXISD::TexUnifiedCubeArrayS32FloatGrad: 2950 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R; 2951 break; 2952 case NVPTXISD::TexUnifiedCubeArrayU32FloatGrad: 2953 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R; 2954 break; 2955 } 2956 2957 // Copy over operands 2958 SmallVector<SDValue, 8> Ops(drop_begin(N->ops())); 2959 Ops.push_back(N->getOperand(0)); // Move chain to the back. 2960 2961 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops)); 2962 return true; 2963 } 2964 2965 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) { 2966 unsigned Opc = 0; 2967 switch (N->getOpcode()) { 2968 default: return false; 2969 case NVPTXISD::Suld1DI8Clamp: 2970 Opc = NVPTX::SULD_1D_I8_CLAMP_R; 2971 break; 2972 case NVPTXISD::Suld1DI16Clamp: 2973 Opc = NVPTX::SULD_1D_I16_CLAMP_R; 2974 break; 2975 case NVPTXISD::Suld1DI32Clamp: 2976 Opc = NVPTX::SULD_1D_I32_CLAMP_R; 2977 break; 2978 case NVPTXISD::Suld1DI64Clamp: 2979 Opc = NVPTX::SULD_1D_I64_CLAMP_R; 2980 break; 2981 case NVPTXISD::Suld1DV2I8Clamp: 2982 Opc = NVPTX::SULD_1D_V2I8_CLAMP_R; 2983 break; 2984 case NVPTXISD::Suld1DV2I16Clamp: 2985 Opc = NVPTX::SULD_1D_V2I16_CLAMP_R; 2986 break; 2987 case NVPTXISD::Suld1DV2I32Clamp: 2988 Opc = NVPTX::SULD_1D_V2I32_CLAMP_R; 2989 break; 2990 case NVPTXISD::Suld1DV2I64Clamp: 2991 Opc = NVPTX::SULD_1D_V2I64_CLAMP_R; 2992 break; 2993 case NVPTXISD::Suld1DV4I8Clamp: 2994 Opc = NVPTX::SULD_1D_V4I8_CLAMP_R; 2995 break; 2996 case NVPTXISD::Suld1DV4I16Clamp: 2997 Opc = NVPTX::SULD_1D_V4I16_CLAMP_R; 2998 break; 2999 case NVPTXISD::Suld1DV4I32Clamp: 3000 Opc = NVPTX::SULD_1D_V4I32_CLAMP_R; 3001 break; 3002 case NVPTXISD::Suld1DArrayI8Clamp: 3003 Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R; 3004 break; 3005 case NVPTXISD::Suld1DArrayI16Clamp: 3006 Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R; 3007 break; 3008 case NVPTXISD::Suld1DArrayI32Clamp: 3009 Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R; 3010 break; 3011 case NVPTXISD::Suld1DArrayI64Clamp: 3012 Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R; 3013 break; 3014 case NVPTXISD::Suld1DArrayV2I8Clamp: 3015 Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R; 3016 break; 3017 case NVPTXISD::Suld1DArrayV2I16Clamp: 3018 Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R; 3019 break; 3020 case NVPTXISD::Suld1DArrayV2I32Clamp: 3021 Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R; 3022 break; 3023 case NVPTXISD::Suld1DArrayV2I64Clamp: 3024 Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R; 3025 break; 3026 case NVPTXISD::Suld1DArrayV4I8Clamp: 3027 Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R; 3028 break; 3029 case NVPTXISD::Suld1DArrayV4I16Clamp: 3030 Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R; 3031 break; 3032 case NVPTXISD::Suld1DArrayV4I32Clamp: 3033 Opc 
= NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R; 3034 break; 3035 case NVPTXISD::Suld2DI8Clamp: 3036 Opc = NVPTX::SULD_2D_I8_CLAMP_R; 3037 break; 3038 case NVPTXISD::Suld2DI16Clamp: 3039 Opc = NVPTX::SULD_2D_I16_CLAMP_R; 3040 break; 3041 case NVPTXISD::Suld2DI32Clamp: 3042 Opc = NVPTX::SULD_2D_I32_CLAMP_R; 3043 break; 3044 case NVPTXISD::Suld2DI64Clamp: 3045 Opc = NVPTX::SULD_2D_I64_CLAMP_R; 3046 break; 3047 case NVPTXISD::Suld2DV2I8Clamp: 3048 Opc = NVPTX::SULD_2D_V2I8_CLAMP_R; 3049 break; 3050 case NVPTXISD::Suld2DV2I16Clamp: 3051 Opc = NVPTX::SULD_2D_V2I16_CLAMP_R; 3052 break; 3053 case NVPTXISD::Suld2DV2I32Clamp: 3054 Opc = NVPTX::SULD_2D_V2I32_CLAMP_R; 3055 break; 3056 case NVPTXISD::Suld2DV2I64Clamp: 3057 Opc = NVPTX::SULD_2D_V2I64_CLAMP_R; 3058 break; 3059 case NVPTXISD::Suld2DV4I8Clamp: 3060 Opc = NVPTX::SULD_2D_V4I8_CLAMP_R; 3061 break; 3062 case NVPTXISD::Suld2DV4I16Clamp: 3063 Opc = NVPTX::SULD_2D_V4I16_CLAMP_R; 3064 break; 3065 case NVPTXISD::Suld2DV4I32Clamp: 3066 Opc = NVPTX::SULD_2D_V4I32_CLAMP_R; 3067 break; 3068 case NVPTXISD::Suld2DArrayI8Clamp: 3069 Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R; 3070 break; 3071 case NVPTXISD::Suld2DArrayI16Clamp: 3072 Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R; 3073 break; 3074 case NVPTXISD::Suld2DArrayI32Clamp: 3075 Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R; 3076 break; 3077 case NVPTXISD::Suld2DArrayI64Clamp: 3078 Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R; 3079 break; 3080 case NVPTXISD::Suld2DArrayV2I8Clamp: 3081 Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R; 3082 break; 3083 case NVPTXISD::Suld2DArrayV2I16Clamp: 3084 Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R; 3085 break; 3086 case NVPTXISD::Suld2DArrayV2I32Clamp: 3087 Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R; 3088 break; 3089 case NVPTXISD::Suld2DArrayV2I64Clamp: 3090 Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R; 3091 break; 3092 case NVPTXISD::Suld2DArrayV4I8Clamp: 3093 Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R; 3094 break; 3095 case NVPTXISD::Suld2DArrayV4I16Clamp: 3096 Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R; 3097 break; 3098 case NVPTXISD::Suld2DArrayV4I32Clamp: 3099 Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R; 3100 break; 3101 case NVPTXISD::Suld3DI8Clamp: 3102 Opc = NVPTX::SULD_3D_I8_CLAMP_R; 3103 break; 3104 case NVPTXISD::Suld3DI16Clamp: 3105 Opc = NVPTX::SULD_3D_I16_CLAMP_R; 3106 break; 3107 case NVPTXISD::Suld3DI32Clamp: 3108 Opc = NVPTX::SULD_3D_I32_CLAMP_R; 3109 break; 3110 case NVPTXISD::Suld3DI64Clamp: 3111 Opc = NVPTX::SULD_3D_I64_CLAMP_R; 3112 break; 3113 case NVPTXISD::Suld3DV2I8Clamp: 3114 Opc = NVPTX::SULD_3D_V2I8_CLAMP_R; 3115 break; 3116 case NVPTXISD::Suld3DV2I16Clamp: 3117 Opc = NVPTX::SULD_3D_V2I16_CLAMP_R; 3118 break; 3119 case NVPTXISD::Suld3DV2I32Clamp: 3120 Opc = NVPTX::SULD_3D_V2I32_CLAMP_R; 3121 break; 3122 case NVPTXISD::Suld3DV2I64Clamp: 3123 Opc = NVPTX::SULD_3D_V2I64_CLAMP_R; 3124 break; 3125 case NVPTXISD::Suld3DV4I8Clamp: 3126 Opc = NVPTX::SULD_3D_V4I8_CLAMP_R; 3127 break; 3128 case NVPTXISD::Suld3DV4I16Clamp: 3129 Opc = NVPTX::SULD_3D_V4I16_CLAMP_R; 3130 break; 3131 case NVPTXISD::Suld3DV4I32Clamp: 3132 Opc = NVPTX::SULD_3D_V4I32_CLAMP_R; 3133 break; 3134 case NVPTXISD::Suld1DI8Trap: 3135 Opc = NVPTX::SULD_1D_I8_TRAP_R; 3136 break; 3137 case NVPTXISD::Suld1DI16Trap: 3138 Opc = NVPTX::SULD_1D_I16_TRAP_R; 3139 break; 3140 case NVPTXISD::Suld1DI32Trap: 3141 Opc = NVPTX::SULD_1D_I32_TRAP_R; 3142 break; 3143 case NVPTXISD::Suld1DI64Trap: 3144 Opc = NVPTX::SULD_1D_I64_TRAP_R; 3145 break; 3146 case NVPTXISD::Suld1DV2I8Trap: 3147 Opc = NVPTX::SULD_1D_V2I8_TRAP_R; 3148 break; 3149 case 
NVPTXISD::Suld1DV2I16Trap: 3150 Opc = NVPTX::SULD_1D_V2I16_TRAP_R; 3151 break; 3152 case NVPTXISD::Suld1DV2I32Trap: 3153 Opc = NVPTX::SULD_1D_V2I32_TRAP_R; 3154 break; 3155 case NVPTXISD::Suld1DV2I64Trap: 3156 Opc = NVPTX::SULD_1D_V2I64_TRAP_R; 3157 break; 3158 case NVPTXISD::Suld1DV4I8Trap: 3159 Opc = NVPTX::SULD_1D_V4I8_TRAP_R; 3160 break; 3161 case NVPTXISD::Suld1DV4I16Trap: 3162 Opc = NVPTX::SULD_1D_V4I16_TRAP_R; 3163 break; 3164 case NVPTXISD::Suld1DV4I32Trap: 3165 Opc = NVPTX::SULD_1D_V4I32_TRAP_R; 3166 break; 3167 case NVPTXISD::Suld1DArrayI8Trap: 3168 Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R; 3169 break; 3170 case NVPTXISD::Suld1DArrayI16Trap: 3171 Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R; 3172 break; 3173 case NVPTXISD::Suld1DArrayI32Trap: 3174 Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R; 3175 break; 3176 case NVPTXISD::Suld1DArrayI64Trap: 3177 Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R; 3178 break; 3179 case NVPTXISD::Suld1DArrayV2I8Trap: 3180 Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R; 3181 break; 3182 case NVPTXISD::Suld1DArrayV2I16Trap: 3183 Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R; 3184 break; 3185 case NVPTXISD::Suld1DArrayV2I32Trap: 3186 Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R; 3187 break; 3188 case NVPTXISD::Suld1DArrayV2I64Trap: 3189 Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R; 3190 break; 3191 case NVPTXISD::Suld1DArrayV4I8Trap: 3192 Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R; 3193 break; 3194 case NVPTXISD::Suld1DArrayV4I16Trap: 3195 Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R; 3196 break; 3197 case NVPTXISD::Suld1DArrayV4I32Trap: 3198 Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R; 3199 break; 3200 case NVPTXISD::Suld2DI8Trap: 3201 Opc = NVPTX::SULD_2D_I8_TRAP_R; 3202 break; 3203 case NVPTXISD::Suld2DI16Trap: 3204 Opc = NVPTX::SULD_2D_I16_TRAP_R; 3205 break; 3206 case NVPTXISD::Suld2DI32Trap: 3207 Opc = NVPTX::SULD_2D_I32_TRAP_R; 3208 break; 3209 case NVPTXISD::Suld2DI64Trap: 3210 Opc = NVPTX::SULD_2D_I64_TRAP_R; 3211 break; 3212 case NVPTXISD::Suld2DV2I8Trap: 3213 Opc = NVPTX::SULD_2D_V2I8_TRAP_R; 3214 break; 3215 case NVPTXISD::Suld2DV2I16Trap: 3216 Opc = NVPTX::SULD_2D_V2I16_TRAP_R; 3217 break; 3218 case NVPTXISD::Suld2DV2I32Trap: 3219 Opc = NVPTX::SULD_2D_V2I32_TRAP_R; 3220 break; 3221 case NVPTXISD::Suld2DV2I64Trap: 3222 Opc = NVPTX::SULD_2D_V2I64_TRAP_R; 3223 break; 3224 case NVPTXISD::Suld2DV4I8Trap: 3225 Opc = NVPTX::SULD_2D_V4I8_TRAP_R; 3226 break; 3227 case NVPTXISD::Suld2DV4I16Trap: 3228 Opc = NVPTX::SULD_2D_V4I16_TRAP_R; 3229 break; 3230 case NVPTXISD::Suld2DV4I32Trap: 3231 Opc = NVPTX::SULD_2D_V4I32_TRAP_R; 3232 break; 3233 case NVPTXISD::Suld2DArrayI8Trap: 3234 Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R; 3235 break; 3236 case NVPTXISD::Suld2DArrayI16Trap: 3237 Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R; 3238 break; 3239 case NVPTXISD::Suld2DArrayI32Trap: 3240 Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R; 3241 break; 3242 case NVPTXISD::Suld2DArrayI64Trap: 3243 Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R; 3244 break; 3245 case NVPTXISD::Suld2DArrayV2I8Trap: 3246 Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R; 3247 break; 3248 case NVPTXISD::Suld2DArrayV2I16Trap: 3249 Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R; 3250 break; 3251 case NVPTXISD::Suld2DArrayV2I32Trap: 3252 Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R; 3253 break; 3254 case NVPTXISD::Suld2DArrayV2I64Trap: 3255 Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R; 3256 break; 3257 case NVPTXISD::Suld2DArrayV4I8Trap: 3258 Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R; 3259 break; 3260 case NVPTXISD::Suld2DArrayV4I16Trap: 3261 Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R; 3262 break; 3263 case 
NVPTXISD::Suld2DArrayV4I32Trap: 3264 Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R; 3265 break; 3266 case NVPTXISD::Suld3DI8Trap: 3267 Opc = NVPTX::SULD_3D_I8_TRAP_R; 3268 break; 3269 case NVPTXISD::Suld3DI16Trap: 3270 Opc = NVPTX::SULD_3D_I16_TRAP_R; 3271 break; 3272 case NVPTXISD::Suld3DI32Trap: 3273 Opc = NVPTX::SULD_3D_I32_TRAP_R; 3274 break; 3275 case NVPTXISD::Suld3DI64Trap: 3276 Opc = NVPTX::SULD_3D_I64_TRAP_R; 3277 break; 3278 case NVPTXISD::Suld3DV2I8Trap: 3279 Opc = NVPTX::SULD_3D_V2I8_TRAP_R; 3280 break; 3281 case NVPTXISD::Suld3DV2I16Trap: 3282 Opc = NVPTX::SULD_3D_V2I16_TRAP_R; 3283 break; 3284 case NVPTXISD::Suld3DV2I32Trap: 3285 Opc = NVPTX::SULD_3D_V2I32_TRAP_R; 3286 break; 3287 case NVPTXISD::Suld3DV2I64Trap: 3288 Opc = NVPTX::SULD_3D_V2I64_TRAP_R; 3289 break; 3290 case NVPTXISD::Suld3DV4I8Trap: 3291 Opc = NVPTX::SULD_3D_V4I8_TRAP_R; 3292 break; 3293 case NVPTXISD::Suld3DV4I16Trap: 3294 Opc = NVPTX::SULD_3D_V4I16_TRAP_R; 3295 break; 3296 case NVPTXISD::Suld3DV4I32Trap: 3297 Opc = NVPTX::SULD_3D_V4I32_TRAP_R; 3298 break; 3299 case NVPTXISD::Suld1DI8Zero: 3300 Opc = NVPTX::SULD_1D_I8_ZERO_R; 3301 break; 3302 case NVPTXISD::Suld1DI16Zero: 3303 Opc = NVPTX::SULD_1D_I16_ZERO_R; 3304 break; 3305 case NVPTXISD::Suld1DI32Zero: 3306 Opc = NVPTX::SULD_1D_I32_ZERO_R; 3307 break; 3308 case NVPTXISD::Suld1DI64Zero: 3309 Opc = NVPTX::SULD_1D_I64_ZERO_R; 3310 break; 3311 case NVPTXISD::Suld1DV2I8Zero: 3312 Opc = NVPTX::SULD_1D_V2I8_ZERO_R; 3313 break; 3314 case NVPTXISD::Suld1DV2I16Zero: 3315 Opc = NVPTX::SULD_1D_V2I16_ZERO_R; 3316 break; 3317 case NVPTXISD::Suld1DV2I32Zero: 3318 Opc = NVPTX::SULD_1D_V2I32_ZERO_R; 3319 break; 3320 case NVPTXISD::Suld1DV2I64Zero: 3321 Opc = NVPTX::SULD_1D_V2I64_ZERO_R; 3322 break; 3323 case NVPTXISD::Suld1DV4I8Zero: 3324 Opc = NVPTX::SULD_1D_V4I8_ZERO_R; 3325 break; 3326 case NVPTXISD::Suld1DV4I16Zero: 3327 Opc = NVPTX::SULD_1D_V4I16_ZERO_R; 3328 break; 3329 case NVPTXISD::Suld1DV4I32Zero: 3330 Opc = NVPTX::SULD_1D_V4I32_ZERO_R; 3331 break; 3332 case NVPTXISD::Suld1DArrayI8Zero: 3333 Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R; 3334 break; 3335 case NVPTXISD::Suld1DArrayI16Zero: 3336 Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R; 3337 break; 3338 case NVPTXISD::Suld1DArrayI32Zero: 3339 Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R; 3340 break; 3341 case NVPTXISD::Suld1DArrayI64Zero: 3342 Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R; 3343 break; 3344 case NVPTXISD::Suld1DArrayV2I8Zero: 3345 Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R; 3346 break; 3347 case NVPTXISD::Suld1DArrayV2I16Zero: 3348 Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R; 3349 break; 3350 case NVPTXISD::Suld1DArrayV2I32Zero: 3351 Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R; 3352 break; 3353 case NVPTXISD::Suld1DArrayV2I64Zero: 3354 Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R; 3355 break; 3356 case NVPTXISD::Suld1DArrayV4I8Zero: 3357 Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R; 3358 break; 3359 case NVPTXISD::Suld1DArrayV4I16Zero: 3360 Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R; 3361 break; 3362 case NVPTXISD::Suld1DArrayV4I32Zero: 3363 Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R; 3364 break; 3365 case NVPTXISD::Suld2DI8Zero: 3366 Opc = NVPTX::SULD_2D_I8_ZERO_R; 3367 break; 3368 case NVPTXISD::Suld2DI16Zero: 3369 Opc = NVPTX::SULD_2D_I16_ZERO_R; 3370 break; 3371 case NVPTXISD::Suld2DI32Zero: 3372 Opc = NVPTX::SULD_2D_I32_ZERO_R; 3373 break; 3374 case NVPTXISD::Suld2DI64Zero: 3375 Opc = NVPTX::SULD_2D_I64_ZERO_R; 3376 break; 3377 case NVPTXISD::Suld2DV2I8Zero: 3378 Opc = NVPTX::SULD_2D_V2I8_ZERO_R; 3379 break; 3380 case NVPTXISD::Suld2DV2I16Zero: 3381 Opc = 
    Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I32Zero:
    Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DV2I64Zero:
    Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I8Zero:
    Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I16Zero:
    Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DV4I32Zero:
    Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayI64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV2I64Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I8Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I16Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld2DArrayV4I32Zero:
    Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DI8Zero:
    Opc = NVPTX::SULD_3D_I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DI16Zero:
    Opc = NVPTX::SULD_3D_I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DI32Zero:
    Opc = NVPTX::SULD_3D_I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DI64Zero:
    Opc = NVPTX::SULD_3D_I64_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I8Zero:
    Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I16Zero:
    Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I32Zero:
    Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
    break;
  case NVPTXISD::Suld3DV2I64Zero:
    Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I8Zero:
    Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I16Zero:
    Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
    break;
  case NVPTXISD::Suld3DV4I32Zero:
    Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}
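
// A sketch of the kinds of 32-bit sequences tryBFE (below) can fold into a
// single bit-field extract; PTX 'bfe' takes a start position and a length:
//   (and (srl %val, 8), 255)    -> bfe.u32 %r, %val, 8, 8
//   (srl (and %val, 0xff00), 8) -> bfe.u32 %r, %val, 8, 8
//   (sra (shl %val, 24), 24)    -> bfe.s32 %r, %val, 0, 8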

/// tryBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction.
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Len;
  SDValue Start;
  SDValue Val;
  bool IsSigned = false;

  if (N->getOpcode() == ISD::AND) {
    // Canonicalize the operands; we want 'and %val, %mask'
    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
      std::swap(LHS, RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits, so we would trade
      // shr+and for bfe+and, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    int64_t NumBits = countr_one(MaskVal);
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have a 'srl/and' pair; extract the effective start bit and length
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left? "good" is defined here as bits
        // that exist in the original value, not shifted in.
        int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
      } else {
        // Do not handle the case where the shift amount (which can be zero if
        // no srl was found) is not constant. We could handle this case, but it
        // would require run-time logic that would be more expensive than just
        // emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the 'and' is not a shift.
      // While it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher throughput.
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(0);
      SDValue AndRHS = LHS->getOperand(1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(AndLHS)) {
        std::swap(AndLHS, AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(MaskVal)) {
        NumZeros = llvm::countr_zero(MaskVal);
        unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask, minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
      Val = LHS->getOperand(0);

      SDValue ShlRHS = LHS->getOperand(1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract. (Since we also check that the inner shift is <= the outer
      // shift, this also implies that the inner shift is < the type size.)
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                        MVT::i32);
      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have an arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // No can do...
      return false;
    }
  } else {
    // No can do...
    return false;
  }
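
  // At this point, Val/Start/Len describe the bitfield to extract; all that
  // remains is to pick the bfe opcode for the operand width and signedness.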
  unsigned Opc;
  // The BFE operations formed here from 'and' and 'srl' always use the
  // unsigned variants; only the shl+sra pattern above sets IsSigned.
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {Val, Start, Len};

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}

// SelectDirectAddr - Match a direct address for DAG.
// A direct address could be a globaladdress or externalsymbol.
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
  // Return true if TGA or ES.
  if (N.getOpcode() == ISD::TargetGlobalAddress ||
      N.getOpcode() == ISD::TargetExternalSymbol) {
    Address = N;
    return true;
  }
  if (N.getOpcode() == NVPTXISD::Wrapper) {
    Address = N.getOperand(0);
    return true;
  }
  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
    if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
  }
  return false;
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi_imp(SDNode *OpNode, SDValue Addr,
                                         SDValue &Base, SDValue &Offset,
                                         MVT mvt) {
  if (Addr.getOpcode() == ISD::ADD) {
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      SDValue base = Addr.getOperand(0);
      if (SelectDirectAddr(base, Base)) {
        Offset =
            CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode), mvt);
        return true;
      }
    }
  }
  return false;
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}
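
// For reference (illustrative PTX), the two address forms selected here are
//   [symbol+imm],   e.g. ld.global.u32 %r0, [gvar+4];
//   [register+imm], e.g. ld.u32 %r0, [%rd1+8];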

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri_imp(SDNode *OpNode, SDValue Addr,
                                         SDValue &Base, SDValue &Offset,
                                         MVT mvt) {
  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
    Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
    return true;
  }
  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
      Addr.getOpcode() == ISD::TargetGlobalAddress)
    return false; // direct calls.

  if (Addr.getOpcode() == ISD::ADD) {
    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
      return false;
    }
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      if (FrameIndexSDNode *FIN =
              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
        // Constant offset from frame ref.
        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
      else
        Base = Addr.getOperand(0);

      // Offset must fit in a 32-bit signed int in PTX [register+offset]
      // address mode
      if (!CN->getAPIntValue().isSignedIntN(32))
        return false;

      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), SDLoc(OpNode),
                                         MVT::i32);
      return true;
    }
  }
  return false;
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}

bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                 unsigned int spN) const {
  const Value *Src = nullptr;
  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
      return true;
    Src = mN->getMemOperand()->getValue();
  }
  if (!Src)
    return false;
  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
    return (PT->getAddressSpace() == spN);
  return false;
}

/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
    std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1;
  switch (ConstraintID) {
  default:
    return true;
  case InlineAsm::ConstraintCode::m: // memory
    if (SelectDirectAddr(Op, Op0)) {
      OutOps.push_back(Op0);
      OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
      return false;
    }
    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
      OutOps.push_back(Op0);
      OutOps.push_back(Op1);
      return false;
    }
    break;
  }
  return true;
}

void NVPTXDAGToDAGISel::SelectV2I64toI128(SDNode *N) {
  // Lower a CopyToReg with two 64-bit inputs
  // Dst:i128, lo:i64, hi:i64
  //
  // CopyToReg Dst, lo, hi;
  //
  // ==>
  //
  // tmp = V2I64toI128 {lo, hi};
  // CopyToReg Dst, tmp;
  SDValue Dst = N->getOperand(1);
  SDValue Lo = N->getOperand(2);
  SDValue Hi = N->getOperand(3);

  SDLoc DL(N);
  SDNode *Mov =
      CurDAG->getMachineNode(NVPTX::V2I64toI128, DL, MVT::i128, {Lo, Hi});

  SmallVector<SDValue, 4> NewOps(N->getNumOperands() - 1);
  NewOps[0] = N->getOperand(0);
  NewOps[1] = Dst;
  NewOps[2] = SDValue(Mov, 0);
  if (N->getNumOperands() == 5)
    NewOps[3] = N->getOperand(4);
  SDValue NewValue = CurDAG->getNode(ISD::CopyToReg, DL,
                                     SmallVector<EVT>(N->values()), NewOps);

  ReplaceNode(N, NewValue.getNode());
}
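
// Note: NVPTX models an i128 value as a pair of i64 registers; the helpers
// above and below convert between that pair and the single i128 value seen
// by CopyToReg/CopyFromReg.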

void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
  // Lower CopyFromReg from a 128-bit reg to two 64-bit regs
  // Dst:i128, Src:i128
  //
  // {lo, hi} = CopyFromReg Src
  //
  // ==>
  //
  // {lo, hi} = I128toV2I64 Src
  //
  SDValue Ch = N->getOperand(0);
  SDValue Src = N->getOperand(1);
  SDValue Glue = N->getOperand(2);
  SDLoc DL(N);

  // Add Glue and Ch to the operands and results to avoid breaking the
  // execution order
  SDNode *Mov = CurDAG->getMachineNode(
      NVPTX::I128toV2I64, DL,
      {MVT::i64, MVT::i64, Ch.getValueType(), Glue.getValueType()},
      {Src, Ch, Glue});

  ReplaceNode(N, Mov);
}

/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
/// conversion from \p SrcTy to \p DestTy.
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
                                             LoadSDNode *LdNode) {
  bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
  switch (SrcTy.SimpleTy) {
  default:
    llvm_unreachable("Unhandled source type");
  case MVT::i8:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
    }
  case MVT::i16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
    }
  case MVT::i32:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
    }
  case MVT::i64:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
    }
  case MVT::f16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::f32:
      return NVPTX::CVT_f32_f16;
    case MVT::f64:
      return NVPTX::CVT_f64_f16;
    }
  }
}