//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"

/// createNVPTXISelDag - This pass converts a legalized DAG into an
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOpt::Level OptLevel) {
  return new NVPTXDAGToDAGISel(TM, OptLevel);
}

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOpt::Level OptLevel)
    : SelectionDAGISel(tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > 0);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::useShortPointers() const {
  return TM.useShortPointers();
}

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }
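
  // Dispatch on the opcode. Nodes without a custom case below fall through to
  // SelectCode(), the TableGen-generated matcher, at the bottom.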
  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;

  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  case NVPTXISD::Tex1DFloatS32:
  case NVPTXISD::Tex1DFloatFloat:
  case NVPTXISD::Tex1DFloatFloatLevel:
  case NVPTXISD::Tex1DFloatFloatGrad:
  case NVPTXISD::Tex1DS32S32:
  case NVPTXISD::Tex1DS32Float:
  case NVPTXISD::Tex1DS32FloatLevel:
  case NVPTXISD::Tex1DS32FloatGrad:
  case NVPTXISD::Tex1DU32S32:
  case NVPTXISD::Tex1DU32Float:
  case NVPTXISD::Tex1DU32FloatLevel:
  case NVPTXISD::Tex1DU32FloatGrad:
  case NVPTXISD::Tex1DArrayFloatS32:
  case NVPTXISD::Tex1DArrayFloatFloat:
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
  case NVPTXISD::Tex1DArrayS32S32:
  case NVPTXISD::Tex1DArrayS32Float:
  case NVPTXISD::Tex1DArrayS32FloatLevel:
  case NVPTXISD::Tex1DArrayS32FloatGrad:
  case NVPTXISD::Tex1DArrayU32S32:
  case NVPTXISD::Tex1DArrayU32Float:
  case NVPTXISD::Tex1DArrayU32FloatLevel:
  case NVPTXISD::Tex1DArrayU32FloatGrad:
  case NVPTXISD::Tex2DFloatS32:
  case NVPTXISD::Tex2DFloatFloat:
  case NVPTXISD::Tex2DFloatFloatLevel:
  case NVPTXISD::Tex2DFloatFloatGrad:
  case NVPTXISD::Tex2DS32S32:
  case NVPTXISD::Tex2DS32Float:
  case NVPTXISD::Tex2DS32FloatLevel:
  case NVPTXISD::Tex2DS32FloatGrad:
  case NVPTXISD::Tex2DU32S32:
  case NVPTXISD::Tex2DU32Float:
  case NVPTXISD::Tex2DU32FloatLevel:
  case NVPTXISD::Tex2DU32FloatGrad:
  case NVPTXISD::Tex2DArrayFloatS32:
  case NVPTXISD::Tex2DArrayFloatFloat:
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
  case NVPTXISD::Tex2DArrayS32S32:
  case NVPTXISD::Tex2DArrayS32Float:
  case NVPTXISD::Tex2DArrayS32FloatLevel:
  case NVPTXISD::Tex2DArrayS32FloatGrad:
  case NVPTXISD::Tex2DArrayU32S32:
  case NVPTXISD::Tex2DArrayU32Float:
  case NVPTXISD::Tex2DArrayU32FloatLevel:
  case NVPTXISD::Tex2DArrayU32FloatGrad:
  case NVPTXISD::Tex3DFloatS32:
  case NVPTXISD::Tex3DFloatFloat:
  case NVPTXISD::Tex3DFloatFloatLevel:
  case NVPTXISD::Tex3DFloatFloatGrad:
  case NVPTXISD::Tex3DS32S32:
  case NVPTXISD::Tex3DS32Float:
  case NVPTXISD::Tex3DS32FloatLevel:
  case NVPTXISD::Tex3DS32FloatGrad:
  case NVPTXISD::Tex3DU32S32:
  case NVPTXISD::Tex3DU32Float:
  case NVPTXISD::Tex3DU32FloatLevel:
  case NVPTXISD::Tex3DU32FloatGrad:
  case NVPTXISD::TexCubeFloatFloat:
  case NVPTXISD::TexCubeFloatFloatLevel:
  case NVPTXISD::TexCubeS32Float:
  case NVPTXISD::TexCubeS32FloatLevel:
  case NVPTXISD::TexCubeU32Float:
  case NVPTXISD::TexCubeU32FloatLevel:
  case NVPTXISD::TexCubeArrayFloatFloat:
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
  case NVPTXISD::TexCubeArrayS32Float:
  case NVPTXISD::TexCubeArrayS32FloatLevel:
  case NVPTXISD::TexCubeArrayU32Float:
  case NVPTXISD::TexCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4R2DFloatFloat:
  case NVPTXISD::Tld4G2DFloatFloat:
  case NVPTXISD::Tld4B2DFloatFloat:
  case NVPTXISD::Tld4A2DFloatFloat:
  case NVPTXISD::Tld4R2DS64Float:
  case NVPTXISD::Tld4G2DS64Float:
  case NVPTXISD::Tld4B2DS64Float:
  case NVPTXISD::Tld4A2DS64Float:
  case NVPTXISD::Tld4R2DU64Float:
  case NVPTXISD::Tld4G2DU64Float:
  case NVPTXISD::Tld4B2DU64Float:
  case NVPTXISD::Tld4A2DU64Float:
  case NVPTXISD::TexUnified1DFloatS32:
  case NVPTXISD::TexUnified1DFloatFloat:
  case NVPTXISD::TexUnified1DFloatFloatLevel:
  case NVPTXISD::TexUnified1DFloatFloatGrad:
  case NVPTXISD::TexUnified1DS32S32:
  case NVPTXISD::TexUnified1DS32Float:
  case NVPTXISD::TexUnified1DS32FloatLevel:
  case NVPTXISD::TexUnified1DS32FloatGrad:
  case NVPTXISD::TexUnified1DU32S32:
  case NVPTXISD::TexUnified1DU32Float:
  case NVPTXISD::TexUnified1DU32FloatLevel:
  case NVPTXISD::TexUnified1DU32FloatGrad:
  case NVPTXISD::TexUnified1DArrayFloatS32:
  case NVPTXISD::TexUnified1DArrayFloatFloat:
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified1DArrayS32S32:
  case NVPTXISD::TexUnified1DArrayS32Float:
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
  case NVPTXISD::TexUnified1DArrayU32S32:
  case NVPTXISD::TexUnified1DArrayU32Float:
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
  case NVPTXISD::TexUnified2DFloatS32:
  case NVPTXISD::TexUnified2DFloatFloat:
  case NVPTXISD::TexUnified2DFloatFloatLevel:
  case NVPTXISD::TexUnified2DFloatFloatGrad:
  case NVPTXISD::TexUnified2DS32S32:
  case NVPTXISD::TexUnified2DS32Float:
  case NVPTXISD::TexUnified2DS32FloatLevel:
  case NVPTXISD::TexUnified2DS32FloatGrad:
  case NVPTXISD::TexUnified2DU32S32:
  case NVPTXISD::TexUnified2DU32Float:
  case NVPTXISD::TexUnified2DU32FloatLevel:
  case NVPTXISD::TexUnified2DU32FloatGrad:
  case NVPTXISD::TexUnified2DArrayFloatS32:
  case NVPTXISD::TexUnified2DArrayFloatFloat:
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
  case NVPTXISD::TexUnified2DArrayS32S32:
  case NVPTXISD::TexUnified2DArrayS32Float:
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
  case NVPTXISD::TexUnified2DArrayU32S32:
  case NVPTXISD::TexUnified2DArrayU32Float:
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
  case NVPTXISD::TexUnified3DFloatS32:
  case NVPTXISD::TexUnified3DFloatFloat:
  case NVPTXISD::TexUnified3DFloatFloatLevel:
  case NVPTXISD::TexUnified3DFloatFloatGrad:
  case NVPTXISD::TexUnified3DS32S32:
  case NVPTXISD::TexUnified3DS32Float:
  case NVPTXISD::TexUnified3DS32FloatLevel:
  case NVPTXISD::TexUnified3DS32FloatGrad:
  case NVPTXISD::TexUnified3DU32S32:
  case NVPTXISD::TexUnified3DU32Float:
  case NVPTXISD::TexUnified3DU32FloatLevel:
  case NVPTXISD::TexUnified3DU32FloatGrad:
  case NVPTXISD::TexUnifiedCubeFloatFloat:
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeS32Float:
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeU32Float:
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
  case NVPTXISD::Tld4UnifiedR2DS64Float:
  case NVPTXISD::Tld4UnifiedG2DS64Float:
  case NVPTXISD::Tld4UnifiedB2DS64Float:
  case NVPTXISD::Tld4UnifiedA2DS64Float:
  case NVPTXISD::Tld4UnifiedR2DU64Float:
  case NVPTXISD::Tld4UnifiedG2DU64Float:
  case NVPTXISD::Tld4UnifiedB2DU64Float:
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    if (tryTextureIntrinsic(N))
      return;
    break;
  case NVPTXISD::Suld1DI8Clamp:
  case NVPTXISD::Suld1DI16Clamp:
  case NVPTXISD::Suld1DI32Clamp:
  case NVPTXISD::Suld1DI64Clamp:
  case NVPTXISD::Suld1DV2I8Clamp:
  case NVPTXISD::Suld1DV2I16Clamp:
  case NVPTXISD::Suld1DV2I32Clamp:
  case NVPTXISD::Suld1DV2I64Clamp:
  case NVPTXISD::Suld1DV4I8Clamp:
  case NVPTXISD::Suld1DV4I16Clamp:
  case NVPTXISD::Suld1DV4I32Clamp:
  case NVPTXISD::Suld1DArrayI8Clamp:
  case NVPTXISD::Suld1DArrayI16Clamp:
  case NVPTXISD::Suld1DArrayI32Clamp:
  case NVPTXISD::Suld1DArrayI64Clamp:
  case NVPTXISD::Suld1DArrayV2I8Clamp:
  case NVPTXISD::Suld1DArrayV2I16Clamp:
  case NVPTXISD::Suld1DArrayV2I32Clamp:
  case NVPTXISD::Suld1DArrayV2I64Clamp:
  case NVPTXISD::Suld1DArrayV4I8Clamp:
  case NVPTXISD::Suld1DArrayV4I16Clamp:
  case NVPTXISD::Suld1DArrayV4I32Clamp:
  case NVPTXISD::Suld2DI8Clamp:
  case NVPTXISD::Suld2DI16Clamp:
  case NVPTXISD::Suld2DI32Clamp:
  case NVPTXISD::Suld2DI64Clamp:
  case NVPTXISD::Suld2DV2I8Clamp:
  case NVPTXISD::Suld2DV2I16Clamp:
  case NVPTXISD::Suld2DV2I32Clamp:
  case NVPTXISD::Suld2DV2I64Clamp:
  case NVPTXISD::Suld2DV4I8Clamp:
  case NVPTXISD::Suld2DV4I16Clamp:
  case NVPTXISD::Suld2DV4I32Clamp:
  case NVPTXISD::Suld2DArrayI8Clamp:
  case NVPTXISD::Suld2DArrayI16Clamp:
  case NVPTXISD::Suld2DArrayI32Clamp:
  case NVPTXISD::Suld2DArrayI64Clamp:
  case NVPTXISD::Suld2DArrayV2I8Clamp:
  case NVPTXISD::Suld2DArrayV2I16Clamp:
  case NVPTXISD::Suld2DArrayV2I32Clamp:
  case NVPTXISD::Suld2DArrayV2I64Clamp:
  case NVPTXISD::Suld2DArrayV4I8Clamp:
  case NVPTXISD::Suld2DArrayV4I16Clamp:
  case NVPTXISD::Suld2DArrayV4I32Clamp:
  case NVPTXISD::Suld3DI8Clamp:
  case NVPTXISD::Suld3DI16Clamp:
  case NVPTXISD::Suld3DI32Clamp:
  case NVPTXISD::Suld3DI64Clamp:
  case NVPTXISD::Suld3DV2I8Clamp:
  case NVPTXISD::Suld3DV2I16Clamp:
  case NVPTXISD::Suld3DV2I32Clamp:
  case NVPTXISD::Suld3DV2I64Clamp:
  case NVPTXISD::Suld3DV4I8Clamp:
  case NVPTXISD::Suld3DV4I16Clamp:
  case NVPTXISD::Suld3DV4I32Clamp:
  case NVPTXISD::Suld1DI8Trap:
  case NVPTXISD::Suld1DI16Trap:
  case NVPTXISD::Suld1DI32Trap:
  case NVPTXISD::Suld1DI64Trap:
  case NVPTXISD::Suld1DV2I8Trap:
  case NVPTXISD::Suld1DV2I16Trap:
  case NVPTXISD::Suld1DV2I32Trap:
  case NVPTXISD::Suld1DV2I64Trap:
  case NVPTXISD::Suld1DV4I8Trap:
  case NVPTXISD::Suld1DV4I16Trap:
  case NVPTXISD::Suld1DV4I32Trap:
  case NVPTXISD::Suld1DArrayI8Trap:
  case NVPTXISD::Suld1DArrayI16Trap:
  case NVPTXISD::Suld1DArrayI32Trap:
  case NVPTXISD::Suld1DArrayI64Trap:
  case NVPTXISD::Suld1DArrayV2I8Trap:
  case NVPTXISD::Suld1DArrayV2I16Trap:
  case NVPTXISD::Suld1DArrayV2I32Trap:
  case NVPTXISD::Suld1DArrayV2I64Trap:
  case NVPTXISD::Suld1DArrayV4I8Trap:
  case NVPTXISD::Suld1DArrayV4I16Trap:
  case NVPTXISD::Suld1DArrayV4I32Trap:
  case NVPTXISD::Suld2DI8Trap:
  case NVPTXISD::Suld2DI16Trap:
  case NVPTXISD::Suld2DI32Trap:
  case NVPTXISD::Suld2DI64Trap:
  case NVPTXISD::Suld2DV2I8Trap:
  case NVPTXISD::Suld2DV2I16Trap:
  case NVPTXISD::Suld2DV2I32Trap:
  case NVPTXISD::Suld2DV2I64Trap:
  case NVPTXISD::Suld2DV4I8Trap:
  case NVPTXISD::Suld2DV4I16Trap:
  case NVPTXISD::Suld2DV4I32Trap:
  case NVPTXISD::Suld2DArrayI8Trap:
  case NVPTXISD::Suld2DArrayI16Trap:
  case NVPTXISD::Suld2DArrayI32Trap:
  case NVPTXISD::Suld2DArrayI64Trap:
  case NVPTXISD::Suld2DArrayV2I8Trap:
  case NVPTXISD::Suld2DArrayV2I16Trap:
  case NVPTXISD::Suld2DArrayV2I32Trap:
  case NVPTXISD::Suld2DArrayV2I64Trap:
  case NVPTXISD::Suld2DArrayV4I8Trap:
  case NVPTXISD::Suld2DArrayV4I16Trap:
  case NVPTXISD::Suld2DArrayV4I32Trap:
  case NVPTXISD::Suld3DI8Trap:
  case NVPTXISD::Suld3DI16Trap:
  case NVPTXISD::Suld3DI32Trap:
  case NVPTXISD::Suld3DI64Trap:
  case NVPTXISD::Suld3DV2I8Trap:
  case NVPTXISD::Suld3DV2I16Trap:
  case NVPTXISD::Suld3DV2I32Trap:
  case NVPTXISD::Suld3DV2I64Trap:
  case NVPTXISD::Suld3DV4I8Trap:
  case NVPTXISD::Suld3DV4I16Trap:
  case NVPTXISD::Suld3DV4I32Trap:
  case NVPTXISD::Suld1DI8Zero:
  case NVPTXISD::Suld1DI16Zero:
  case NVPTXISD::Suld1DI32Zero:
  case NVPTXISD::Suld1DI64Zero:
  case NVPTXISD::Suld1DV2I8Zero:
  case NVPTXISD::Suld1DV2I16Zero:
  case NVPTXISD::Suld1DV2I32Zero:
  case NVPTXISD::Suld1DV2I64Zero:
  case NVPTXISD::Suld1DV4I8Zero:
  case NVPTXISD::Suld1DV4I16Zero:
  case NVPTXISD::Suld1DV4I32Zero:
  case NVPTXISD::Suld1DArrayI8Zero:
  case NVPTXISD::Suld1DArrayI16Zero:
  case NVPTXISD::Suld1DArrayI32Zero:
  case NVPTXISD::Suld1DArrayI64Zero:
  case NVPTXISD::Suld1DArrayV2I8Zero:
  case NVPTXISD::Suld1DArrayV2I16Zero:
  case NVPTXISD::Suld1DArrayV2I32Zero:
  case NVPTXISD::Suld1DArrayV2I64Zero:
  case NVPTXISD::Suld1DArrayV4I8Zero:
  case NVPTXISD::Suld1DArrayV4I16Zero:
  case NVPTXISD::Suld1DArrayV4I32Zero:
  case NVPTXISD::Suld2DI8Zero:
  case NVPTXISD::Suld2DI16Zero:
  case NVPTXISD::Suld2DI32Zero:
  case NVPTXISD::Suld2DI64Zero:
  case NVPTXISD::Suld2DV2I8Zero:
  case NVPTXISD::Suld2DV2I16Zero:
  case NVPTXISD::Suld2DV2I32Zero:
  case NVPTXISD::Suld2DV2I64Zero:
  case NVPTXISD::Suld2DV4I8Zero:
  case NVPTXISD::Suld2DV4I16Zero:
  case NVPTXISD::Suld2DV4I32Zero:
  case NVPTXISD::Suld2DArrayI8Zero:
  case NVPTXISD::Suld2DArrayI16Zero:
  case NVPTXISD::Suld2DArrayI32Zero:
  case NVPTXISD::Suld2DArrayI64Zero:
  case NVPTXISD::Suld2DArrayV2I8Zero:
  case NVPTXISD::Suld2DArrayV2I16Zero:
  case NVPTXISD::Suld2DArrayV2I32Zero:
  case NVPTXISD::Suld2DArrayV2I64Zero:
  case NVPTXISD::Suld2DArrayV4I8Zero:
  case NVPTXISD::Suld2DArrayV4I16Zero:
  case NVPTXISD::Suld2DArrayV4I32Zero:
  case NVPTXISD::Suld3DI8Zero:
  case NVPTXISD::Suld3DI16Zero:
  case NVPTXISD::Suld3DI32Zero:
  case NVPTXISD::Suld3DI64Zero:
  case NVPTXISD::Suld3DV2I8Zero:
  case NVPTXISD::Suld3DV2I16Zero:
  case NVPTXISD::Suld3DV2I32Zero:
  case NVPTXISD::Suld3DV2I64Zero:
  case NVPTXISD::Suld3DV4I8Zero:
  case NVPTXISD::Suld3DV4I16Zero:
  case NVPTXISD::Suld3DV4I32Zero:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP16(N))
      return;
    break;
  default:
    break;
  }
  SelectCode(N);
}

bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}

// There's no way to specify FP16 immediates in .f16 ops, so we have to
// load them into an .f16 register first.
bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
  if (N->getValueType(0) != MVT::f16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
  SDNode *LoadConstF16 =
      CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}

// Map ISD::CONDCODE value to appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode().
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}
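
// SelectSETP_F16X2 lowers an f16x2 comparison to a single SETP_f16x2rr
// machine node yielding two i1 predicates. The compare mode comes from
// getPTXCmpMode() above; e.g. ISD::SETOLT with FTZ enabled should print as a
// mode along the lines of "lt.ftz".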
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about f16x2 as it's the only real vector type we
  // need to deal with.
  if (Vector.getSimpleValueType() != MVT::v2f16)
    return false;

  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  unsigned Op = NVPTX::SplitF16x2;
  // If the vector has been BITCAST'ed from i32, we can use the original
  // value directly and avoid a register-to-register move.
  SDValue Source = Vector;
  if (Vector->getOpcode() == ISD::BITCAST) {
    Op = NVPTX::SplitI32toF16x2;
    Source = Vector->getOperand(0);
  }
  // Merge (f16 extractelt(V, 0), f16 extractelt(V, 1))
  // into f16,f16 SplitF16x2(V)
  SDNode *ScatterOp =
      CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}

static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
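  //
  // For example, a load through a kernel parameter declared
  // "const float *__restrict" that is never written to passes the inference
  // below and can be selected as ld.global.nc.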
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
  SmallVector<const Value *, 8> Objs;
  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

  return all_of(Objs, [&](const Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}

bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}

void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
                                               : NVPTX::cvta_shared_yes_64)
                         : NVPTX::cvta_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
                                               : NVPTX::cvta_const_yes_64)
                         : NVPTX::cvta_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
                                               : NVPTX::cvta_local_yes_64)
                         : NVPTX::cvta_local_yes;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
                         : NVPTX::cvta_to_global_yes;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
                                               : NVPTX::cvta_to_shared_yes_64)
                         : NVPTX::cvta_to_shared_yes;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
                                               : NVPTX::cvta_to_const_yes_64)
                         : NVPTX::cvta_to_const_yes;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
                                               : NVPTX::cvta_to_local_yes_64)
                         : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}

// Helper function template to reduce the amount of boilerplate code for
// opcode selection.
static Optional<unsigned> pickOpcodeForVT(
    MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
    unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
    unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
    return Opcode_f16;
  case MVT::v2f16:
    return Opcode_f16x2;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return None;
  }
}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // We do not support pre/post inc/dec.
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
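  // For example, an acquire atomic load is rejected here; such nodes are left
  // to the common (TableGen-generated) selection path instead.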
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Signed   : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    fromType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  Optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(
        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
          NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(
          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Signed   : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value.
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else if (ScalarVT.isFloatingPoint())
    FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                             : NVPTX::PTXLdStInstCode::Float;
  else
    FromType = NVPTX::PTXLdStInstCode::Unsigned;

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8f16 is a special case. PTX doesn't have ld.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // load them with ld.v4.b32.
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
                          NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
                          NVPTX::LDV_f32_v4_avar, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
                          NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
                          NVPTX::LDV_f32_v4_asi, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
            NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
            NVPTX::LDV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
                            NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
                            NVPTX::LDV_f32_v4_ari, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
            NVPTX::LDV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
            NVPTX::LDV_f32_v4_areg, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

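  // Transfer the original memory operand to the selected machine node so
  // alias information is preserved for later passes.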
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}

bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second
  // operand.
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // Vectors of f16 are loaded/stored as multiples of v2f16 elements.
    if (EltVT == MVT::f16 && N->getValueType(0) == MVT::v2f16) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = MVT::v2f16;
      NumElts /= 2;
    }
  }

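  // For example, a v4f16 memory type arrives here as four f16 elements and is
  // re-described above as two v2f16 elements (EltVT = v2f16, NumElts = 2).
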
  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::INTRINSIC_W_CHAIN:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
                               NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
                               NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
                                 NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
                                 NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
                                 NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
1642 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32, 1643 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None, 1644 NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32, 1645 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32, 1646 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None); 1647 break; 1648 } 1649 } 1650 if (!Opcode) 1651 return false; 1652 SDValue Ops[] = { Op1, Chain }; 1653 LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops); 1654 } 1655 1656 MachineMemOperand *MemRef = Mem->getMemOperand(); 1657 CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef}); 1658 1659 // For automatic generation of LDG (through SelectLoad[Vector], not the 1660 // intrinsics), we may have an extending load like: 1661 // 1662 // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64 1663 // 1664 // In this case, the matching logic above will select a load for the original 1665 // memory type (in this case, i8) and our types will not match (the node needs 1666 // to return an i32 in this case). Our LDG/LDU nodes do not support the 1667 // concept of sign-/zero-extension, so emulate it here by adding an explicit 1668 // CVT instruction. Ptxas should clean up any redundancies here. 1669 1670 EVT OrigType = N->getValueType(0); 1671 LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N); 1672 1673 if (OrigType != EltVT && LdNode) { 1674 // We have an extending-load. The instruction we selected operates on the 1675 // smaller type, but the SDNode we are replacing has the larger type. We 1676 // need to emit a CVT to make the types match. 1677 bool IsSigned = LdNode->getExtensionType() == ISD::SEXTLOAD; 1678 unsigned CvtOpc = GetConvertOpcode(OrigType.getSimpleVT(), 1679 EltVT.getSimpleVT(), IsSigned); 1680 1681 // For each output value, apply the manual sign/zero-extension and make sure 1682 // all users of the load go through that CVT. 1683 for (unsigned i = 0; i != NumElts; ++i) { 1684 SDValue Res(LD, i); 1685 SDValue OrigVal(N, i); 1686 1687 SDNode *CvtNode = 1688 CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res, 1689 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, 1690 DL, MVT::i32)); 1691 ReplaceUses(OrigVal, SDValue(CvtNode, 0)); 1692 } 1693 } 1694 1695 ReplaceNode(N, LD); 1696 return true; 1697 } 1698 1699 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { 1700 SDLoc dl(N); 1701 MemSDNode *ST = cast<MemSDNode>(N); 1702 assert(ST->writeMem() && "Expected store"); 1703 StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N); 1704 AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N); 1705 assert((PlainStore || AtomicStore) && "Expected store"); 1706 EVT StoreVT = ST->getMemoryVT(); 1707 SDNode *NVPTXST = nullptr; 1708 1709 // do not support pre/post inc/dec 1710 if (PlainStore && PlainStore->isIndexed()) 1711 return false; 1712 1713 if (!StoreVT.isSimple()) 1714 return false; 1715 1716 AtomicOrdering Ordering = ST->getSuccessOrdering(); 1717 // In order to lower atomic loads with stronger guarantees we would need to 1718 // use store.release or insert fences. However these features were only added 1719 // with PTX ISA 6.0 / sm_70. 1720 // TODO: Check if we can actually use the new instructions and implement them. 
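  //
  // Illustrative sketch only (not what is emitted today): on sm_70+ a release
  // store could be expressed either as
  //   st.release.sys.global.u32 [addr], %r;
  // or as
  //   fence.sc.sys;
  //   st.volatile.global.u32 [addr], %r;
  // Both forms require PTX ISA 6.0, which is why anything stronger than
  // monotonic is rejected below.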
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer types, always use 'u'
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
    // v2f16 is stored using st.b32
    toTypeWidth = 32;
  }
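  // For illustration: a <2 x half> store therefore goes out as one untyped
  // 32-bit access, e.g. "st.global.b32 [addr], %r;", with the two f16 lanes
  // packed into a single 32-bit register.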

  unsigned int toType;
  if (ScalarVT.isFloatingPoint())
    // f16 uses .b16 as its storage type.
    toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                           : NVPTX::PTXLdStInstCode::Float;
  else
    toType = NVPTX::PTXLdStInstCode::Unsigned;

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  Optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode = pickOpcodeForVT(
          SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
          NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  Optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Type Setting: toType + toTypeWidth
  // - for integer types, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType;
  if (ScalarVT.isFloatingPoint())
    ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
                                           : NVPTX::PTXLdStInstCode::Float;
  else
    ToType = NVPTX::PTXLdStInstCode::Unsigned;

  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8f16 is a special case. PTX doesn't have a st.v8.f16
  // instruction. Instead, we split the vector into v2f16 chunks and
  // store them with st.v4.b32.
  if (EltVT == MVT::v2f16) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
    EltVT = MVT::i32;
    ToType = NVPTX::PTXLdStInstCode::Untyped;
    ToTypeWidth = 32;
  }
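  // For illustration: a v8f16 store reaches this point as a StoreV4 whose
  // four value operands are v2f16; each chunk is reinterpreted as an i32 and
  // the whole vector is written with a single st.v4.b32.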

  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));

  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
                          NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
                          NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
                          NVPTX::STV_f32_v4_avar, None);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
                          NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
                          NVPTX::STV_f32_v4_asi, None);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
            NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
            NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
            NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
            NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
            NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
            NVPTX::STV_f32_v4_ari_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
                            NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
                            NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
                            NVPTX::STV_f32_v4_ari, None);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
            NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
            NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
            NVPTX::STV_f32_v4_areg_64, None);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
                            NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
                            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
                            None, NVPTX::STV_f16_v4_areg,
                            NVPTX::STV_f16x2_v4_areg, NVPTX::STV_f32_v4_areg,
                            None);
        break;
      }
    }
    StOps.push_back(N2);
  }

  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});

  ReplaceNode(N, ST);
  return true;
}

bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
  SDValue Chain = Node->getOperand(0);
  SDValue Offset = Node->getOperand(2);
  SDValue Flag = Node->getOperand(3);
  SDLoc DL(Node);
  MemSDNode *Mem = cast<MemSDNode>(Node);

  unsigned VecSize;
  switch (Node->getOpcode()) {
  default:
    return false;
  case NVPTXISD::LoadParam:
    VecSize = 1;
    break;
  case NVPTXISD::LoadParamV2:
    VecSize = 2;
    break;
  case NVPTXISD::LoadParamV4:
    VecSize = 4;
    break;
  }

  EVT EltVT = Node->getValueType(0);
  EVT MemVT = Mem->getMemoryVT();

  Optional<unsigned> Opcode;

  switch (VecSize) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
                             NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
                             NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
                             NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
                             NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
    break;
  case 2:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
                        NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
                        NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
                        NVPTX::LoadParamMemV2F64);
    break;
  case 4:
    Opcode = pickOpcodeForVT(
        MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
        NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
        NVPTX::LoadParamMemV4F32, None);
    break;
  }
  if (!Opcode)
    return false;

  SDVTList VTs;
  if (VecSize == 1) {
    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
  } else if (VecSize == 2) {
    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
  } else {
    EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
    VTs = CurDAG->getVTList(EVTs);
  }

  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();

  SmallVector<SDValue, 2> Ops;
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Flag);

  ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Offset = N->getOperand(1);
  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
  MemSDNode *Mem = cast<MemSDNode>(N);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreRetval:
    NumElts = 1;
    break;
  case NVPTXISD::StoreRetvalV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreRetvalV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 6> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 2));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
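  // Sketch of the i1 case: an i1 return value is widened to i8 by the
  // lowering code, so the node's MemVT is i8, StoreRetvalI8 is picked, and
  // the output is something like "st.param.b8 [func_retval0+0], %rs;".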
  Optional<unsigned> Opcode;
  switch (NumElts) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
                             NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
                             NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
                             NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
    break;
  case 2:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
                             NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
                             NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
                             NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
    break;
  case 4:
    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                             NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
                             NVPTX::StoreRetvalV4I32, None,
                             NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
                             NVPTX::StoreRetvalV4F32, None);
    break;
  }
  if (!Opcode)
    return false;

  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
  return true;
}

bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Param = N->getOperand(1);
  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
  SDValue Offset = N->getOperand(2);
  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
  MemSDNode *Mem = cast<MemSDNode>(N);
  SDValue Flag = N->getOperand(N->getNumOperands() - 1);

  // How many elements do we have?
  unsigned NumElts = 1;
  switch (N->getOpcode()) {
  default:
    return false;
  case NVPTXISD::StoreParamU32:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParam:
    NumElts = 1;
    break;
  case NVPTXISD::StoreParamV2:
    NumElts = 2;
    break;
  case NVPTXISD::StoreParamV4:
    NumElts = 4;
    break;
  }

  // Build vector of operands
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < NumElts; ++i)
    Ops.push_back(N->getOperand(i + 3));
  Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Flag);

  // Determine target opcode
  // If we have an i1, use an 8-bit store. The lowering code in
  // NVPTXISelLowering will have already emitted an upcast.
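  // Sketch of the common case: a scalar StoreParam of an i32 at offset 4 of
  // parameter 1 is selected to StoreParamI32 and prints roughly as
  //   st.param.b32 [param1+4], %r;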
  Optional<unsigned> Opcode;
  switch (N->getOpcode()) {
  default:
    switch (NumElts) {
    default:
      return false;
    case 1:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamI8, NVPTX::StoreParamI16,
                               NVPTX::StoreParamI32, NVPTX::StoreParamI64,
                               NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
                               NVPTX::StoreParamF32, NVPTX::StoreParamF64);
      break;
    case 2:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
                               NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
                               NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
                               NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
      break;
    case 4:
      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
                               NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
                               NVPTX::StoreParamV4I32, None,
                               NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
                               NVPTX::StoreParamV4F32, None);
      break;
    }
    if (!Opcode)
      return false;
    break;
  // Special case: if we have a sign-extend/zero-extend node, insert the
  // conversion instruction first, and use that as the value operand to
  // the selected StoreParam node.
  case NVPTXISD::StoreParamU32: {
    Opcode = NVPTX::StoreParamI32;
    SDValue CvtNone =
        CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL, MVT::i32,
                                         Ops[0], CvtNone);
    Ops[0] = SDValue(Cvt, 0);
    break;
  }
  case NVPTXISD::StoreParamS32: {
    Opcode = NVPTX::StoreParamI32;
    SDValue CvtNone =
        CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL, MVT::i32,
                                         Ops[0], CvtNone);
    Ops[0] = SDValue(Cvt, 0);
    break;
  }
  }

  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
  SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});

  ReplaceNode(N, Ret);
  return true;
}

bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

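  // Opcode naming sketch (assumed convention): NVPTXISD::Tex2DFloatFloatGrad
  // maps to NVPTX::TEX_2D_F32_F32_GRAD_RR, i.e. geometry, result type,
  // coordinate type, an optional LEVEL/GRAD sampling mode, and a suffix for
  // the register/immediate kinds of the texture and sampler operands.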
  switch (N->getOpcode()) {
  default: return false;
  case NVPTXISD::Tex1DFloatS32: Opc = NVPTX::TEX_1D_F32_S32_RR; break;
  case NVPTXISD::Tex1DFloatFloat: Opc = NVPTX::TEX_1D_F32_F32_RR; break;
  case NVPTXISD::Tex1DFloatFloatLevel: Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DFloatFloatGrad: Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR; break;
  case NVPTXISD::Tex1DS32S32: Opc = NVPTX::TEX_1D_S32_S32_RR; break;
  case NVPTXISD::Tex1DS32Float: Opc = NVPTX::TEX_1D_S32_F32_RR; break;
  case NVPTXISD::Tex1DS32FloatLevel: Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DS32FloatGrad: Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR; break;
  case NVPTXISD::Tex1DU32S32: Opc = NVPTX::TEX_1D_U32_S32_RR; break;
  case NVPTXISD::Tex1DU32Float: Opc = NVPTX::TEX_1D_U32_F32_RR; break;
  case NVPTXISD::Tex1DU32FloatLevel: Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DU32FloatGrad: Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR; break;
  case NVPTXISD::Tex1DArrayFloatS32: Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR; break;
  case NVPTXISD::Tex1DArrayFloatFloat: Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR; break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel: Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad: Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR; break;
  case NVPTXISD::Tex1DArrayS32S32: Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR; break;
  case NVPTXISD::Tex1DArrayS32Float: Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR; break;
  case NVPTXISD::Tex1DArrayS32FloatLevel: Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DArrayS32FloatGrad: Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR; break;
  case NVPTXISD::Tex1DArrayU32S32: Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR; break;
  case NVPTXISD::Tex1DArrayU32Float: Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR; break;
  case NVPTXISD::Tex1DArrayU32FloatLevel: Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex1DArrayU32FloatGrad: Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DFloatS32: Opc = NVPTX::TEX_2D_F32_S32_RR; break;
  case NVPTXISD::Tex2DFloatFloat: Opc = NVPTX::TEX_2D_F32_F32_RR; break;
  case NVPTXISD::Tex2DFloatFloatLevel: Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DFloatFloatGrad: Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DS32S32: Opc = NVPTX::TEX_2D_S32_S32_RR; break;
  case NVPTXISD::Tex2DS32Float: Opc = NVPTX::TEX_2D_S32_F32_RR; break;
  case NVPTXISD::Tex2DS32FloatLevel: Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DS32FloatGrad: Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DU32S32: Opc = NVPTX::TEX_2D_U32_S32_RR; break;
  case NVPTXISD::Tex2DU32Float: Opc = NVPTX::TEX_2D_U32_F32_RR; break;
  case NVPTXISD::Tex2DU32FloatLevel: Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DU32FloatGrad: Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DArrayFloatS32: Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR; break;
  case NVPTXISD::Tex2DArrayFloatFloat: Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR; break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel: Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad: Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DArrayS32S32: Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR; break;
  case NVPTXISD::Tex2DArrayS32Float: Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR; break;
  case NVPTXISD::Tex2DArrayS32FloatLevel: Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DArrayS32FloatGrad: Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR; break;
  case NVPTXISD::Tex2DArrayU32S32: Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR; break;
  case NVPTXISD::Tex2DArrayU32Float: Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR; break;
  case NVPTXISD::Tex2DArrayU32FloatLevel: Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex2DArrayU32FloatGrad: Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR; break;
  case NVPTXISD::Tex3DFloatS32: Opc = NVPTX::TEX_3D_F32_S32_RR; break;
  case NVPTXISD::Tex3DFloatFloat: Opc = NVPTX::TEX_3D_F32_F32_RR; break;
  case NVPTXISD::Tex3DFloatFloatLevel: Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex3DFloatFloatGrad: Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR; break;
  case NVPTXISD::Tex3DS32S32: Opc = NVPTX::TEX_3D_S32_S32_RR; break;
  case NVPTXISD::Tex3DS32Float: Opc = NVPTX::TEX_3D_S32_F32_RR; break;
  case NVPTXISD::Tex3DS32FloatLevel: Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex3DS32FloatGrad: Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR; break;
  case NVPTXISD::Tex3DU32S32: Opc = NVPTX::TEX_3D_U32_S32_RR; break;
  case NVPTXISD::Tex3DU32Float: Opc = NVPTX::TEX_3D_U32_F32_RR; break;
  case NVPTXISD::Tex3DU32FloatLevel: Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tex3DU32FloatGrad: Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR; break;
  case NVPTXISD::TexCubeFloatFloat: Opc = NVPTX::TEX_CUBE_F32_F32_RR; break;
  case NVPTXISD::TexCubeFloatFloatLevel: Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR; break;
  case NVPTXISD::TexCubeS32Float: Opc = NVPTX::TEX_CUBE_S32_F32_RR; break;
  case NVPTXISD::TexCubeS32FloatLevel: Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR; break;
  case NVPTXISD::TexCubeU32Float: Opc = NVPTX::TEX_CUBE_U32_F32_RR; break;
  case NVPTXISD::TexCubeU32FloatLevel: Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR; break;
  case NVPTXISD::TexCubeArrayFloatFloat: Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR; break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel: Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR; break;
  case NVPTXISD::TexCubeArrayS32Float: Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR; break;
  case NVPTXISD::TexCubeArrayS32FloatLevel: Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR; break;
  case NVPTXISD::TexCubeArrayU32Float: Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR; break;
  case NVPTXISD::TexCubeArrayU32FloatLevel: Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR; break;
  case NVPTXISD::Tld4R2DFloatFloat: Opc = NVPTX::TLD4_R_2D_F32_F32_RR; break;
  case NVPTXISD::Tld4G2DFloatFloat: Opc = NVPTX::TLD4_G_2D_F32_F32_RR; break;
  case NVPTXISD::Tld4B2DFloatFloat: Opc = NVPTX::TLD4_B_2D_F32_F32_RR; break;
  case NVPTXISD::Tld4A2DFloatFloat: Opc = NVPTX::TLD4_A_2D_F32_F32_RR; break;
  case NVPTXISD::Tld4R2DS64Float: Opc = NVPTX::TLD4_R_2D_S32_F32_RR; break;
  case NVPTXISD::Tld4G2DS64Float: Opc = NVPTX::TLD4_G_2D_S32_F32_RR; break;
  case NVPTXISD::Tld4B2DS64Float: Opc = NVPTX::TLD4_B_2D_S32_F32_RR; break;
  case NVPTXISD::Tld4A2DS64Float: Opc = NVPTX::TLD4_A_2D_S32_F32_RR; break;
  case NVPTXISD::Tld4R2DU64Float: Opc = NVPTX::TLD4_R_2D_U32_F32_RR; break;
  case NVPTXISD::Tld4G2DU64Float: Opc = NVPTX::TLD4_G_2D_U32_F32_RR; break;
  case NVPTXISD::Tld4B2DU64Float: Opc = NVPTX::TLD4_B_2D_U32_F32_RR; break;
  case NVPTXISD::Tld4A2DU64Float: Opc = NVPTX::TLD4_A_2D_U32_F32_RR; break;
  case NVPTXISD::TexUnified1DFloatS32: Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R; break;
  case NVPTXISD::TexUnified1DFloatFloat: Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R; break;
  case NVPTXISD::TexUnified1DFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified1DS32S32: Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R; break;
  case NVPTXISD::TexUnified1DS32Float: Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R; break;
  case NVPTXISD::TexUnified1DS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified1DU32S32: Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R; break;
  case NVPTXISD::TexUnified1DU32Float: Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R; break;
  case NVPTXISD::TexUnified1DU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified1DArrayFloatS32: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R; break;
  case NVPTXISD::TexUnified1DArrayFloatFloat: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R; break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified1DArrayS32S32: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R; break;
  case NVPTXISD::TexUnified1DArrayS32Float: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R; break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified1DArrayU32S32: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R; break;
  case NVPTXISD::TexUnified1DArrayU32Float: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R; break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DFloatS32: Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R; break;
  case NVPTXISD::TexUnified2DFloatFloat: Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R; break;
  case NVPTXISD::TexUnified2DFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DS32S32: Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R; break;
  case NVPTXISD::TexUnified2DS32Float: Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R; break;
  case NVPTXISD::TexUnified2DS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DU32S32: Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R; break;
  case NVPTXISD::TexUnified2DU32Float: Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R; break;
  case NVPTXISD::TexUnified2DU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DArrayFloatS32: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R; break;
  case NVPTXISD::TexUnified2DArrayFloatFloat: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R; break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DArrayS32S32: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R; break;
  case NVPTXISD::TexUnified2DArrayS32Float: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R; break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified2DArrayU32S32: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R; break;
  case NVPTXISD::TexUnified2DArrayU32Float: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R; break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified3DFloatS32: Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R; break;
  case NVPTXISD::TexUnified3DFloatFloat: Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R; break;
  case NVPTXISD::TexUnified3DFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified3DFloatFloatGrad: Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified3DS32S32: Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R; break;
  case NVPTXISD::TexUnified3DS32Float: Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R; break;
  case NVPTXISD::TexUnified3DS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified3DS32FloatGrad: Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R; break;
  case NVPTXISD::TexUnified3DU32S32: Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R; break;
  case NVPTXISD::TexUnified3DU32Float: Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R; break;
  case NVPTXISD::TexUnified3DU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnified3DU32FloatGrad: Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R; break;
  case NVPTXISD::TexUnifiedCubeFloatFloat: Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnifiedCubeS32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnifiedCubeU32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R; break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R; break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel: Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R; break;
  case NVPTXISD::Tld4UnifiedR2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R; break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R; break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R; break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat: Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R; break;
  case NVPTXISD::Tld4UnifiedR2DS64Float: Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R; break;
  case NVPTXISD::Tld4UnifiedG2DS64Float: Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R; break;
  case NVPTXISD::Tld4UnifiedB2DS64Float: Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R; break;
  case NVPTXISD::Tld4UnifiedA2DS64Float: Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R; break;
  case NVPTXISD::Tld4UnifiedR2DU64Float: Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R; break;
  case NVPTXISD::Tld4UnifiedG2DU64Float: Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R; break;
  case NVPTXISD::Tld4UnifiedB2DU64Float: Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R; break;
  case NVPTXISD::Tld4UnifiedA2DU64Float: Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R; break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}

bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
  unsigned Opc = 0;
  switch (N->getOpcode()) {
  default: return false;
  case NVPTXISD::Suld1DI8Clamp: Opc = NVPTX::SULD_1D_I8_CLAMP_R; break;
  case NVPTXISD::Suld1DI16Clamp: Opc = NVPTX::SULD_1D_I16_CLAMP_R; break;
  case NVPTXISD::Suld1DI32Clamp: Opc = NVPTX::SULD_1D_I32_CLAMP_R; break;
  case NVPTXISD::Suld1DI64Clamp: Opc = NVPTX::SULD_1D_I64_CLAMP_R; break;
  case NVPTXISD::Suld1DV2I8Clamp: Opc = NVPTX::SULD_1D_V2I8_CLAMP_R; break;
  case NVPTXISD::Suld1DV2I16Clamp: Opc = NVPTX::SULD_1D_V2I16_CLAMP_R; break;
  case NVPTXISD::Suld1DV2I32Clamp: Opc = NVPTX::SULD_1D_V2I32_CLAMP_R; break;
  case NVPTXISD::Suld1DV2I64Clamp: Opc = NVPTX::SULD_1D_V2I64_CLAMP_R; break;
  case NVPTXISD::Suld1DV4I8Clamp: Opc = NVPTX::SULD_1D_V4I8_CLAMP_R; break;
  case NVPTXISD::Suld1DV4I16Clamp: Opc = NVPTX::SULD_1D_V4I16_CLAMP_R; break;
  case NVPTXISD::Suld1DV4I32Clamp: Opc = NVPTX::SULD_1D_V4I32_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayI8Clamp: Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayI16Clamp: Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayI32Clamp: Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayI64Clamp: Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV2I8Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV2I16Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV2I32Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV2I64Clamp: Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV4I8Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV4I16Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R; break;
  case NVPTXISD::Suld1DArrayV4I32Clamp: Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R; break;
  case NVPTXISD::Suld2DI8Clamp: Opc = NVPTX::SULD_2D_I8_CLAMP_R; break;
  case NVPTXISD::Suld2DI16Clamp: Opc = NVPTX::SULD_2D_I16_CLAMP_R; break;
  case NVPTXISD::Suld2DI32Clamp: Opc = NVPTX::SULD_2D_I32_CLAMP_R; break;
  case NVPTXISD::Suld2DI64Clamp: Opc = NVPTX::SULD_2D_I64_CLAMP_R; break;
  case NVPTXISD::Suld2DV2I8Clamp: Opc = NVPTX::SULD_2D_V2I8_CLAMP_R; break;
  case NVPTXISD::Suld2DV2I16Clamp: Opc = NVPTX::SULD_2D_V2I16_CLAMP_R; break;
  case NVPTXISD::Suld2DV2I32Clamp: Opc = NVPTX::SULD_2D_V2I32_CLAMP_R; break;
  case NVPTXISD::Suld2DV2I64Clamp: Opc = NVPTX::SULD_2D_V2I64_CLAMP_R; break;
  case NVPTXISD::Suld2DV4I8Clamp: Opc = NVPTX::SULD_2D_V4I8_CLAMP_R; break;
  case NVPTXISD::Suld2DV4I16Clamp: Opc = NVPTX::SULD_2D_V4I16_CLAMP_R; break;
  case NVPTXISD::Suld2DV4I32Clamp: Opc = NVPTX::SULD_2D_V4I32_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayI8Clamp: Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayI16Clamp: Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayI32Clamp: Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayI64Clamp: Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV2I8Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV2I16Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV2I32Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV2I64Clamp: Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV4I8Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV4I16Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R; break;
  case NVPTXISD::Suld2DArrayV4I32Clamp: Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R; break;
  case NVPTXISD::Suld3DI8Clamp: Opc = NVPTX::SULD_3D_I8_CLAMP_R; break;
  case NVPTXISD::Suld3DI16Clamp: Opc = NVPTX::SULD_3D_I16_CLAMP_R; break;
  case NVPTXISD::Suld3DI32Clamp: Opc = NVPTX::SULD_3D_I32_CLAMP_R; break;
  case NVPTXISD::Suld3DI64Clamp: Opc = NVPTX::SULD_3D_I64_CLAMP_R; break;
  case NVPTXISD::Suld3DV2I8Clamp: Opc = NVPTX::SULD_3D_V2I8_CLAMP_R; break;
  case NVPTXISD::Suld3DV2I16Clamp: Opc = NVPTX::SULD_3D_V2I16_CLAMP_R; break;
  case NVPTXISD::Suld3DV2I32Clamp: Opc = NVPTX::SULD_3D_V2I32_CLAMP_R; break;
  case NVPTXISD::Suld3DV2I64Clamp: Opc = NVPTX::SULD_3D_V2I64_CLAMP_R; break;
  case NVPTXISD::Suld3DV4I8Clamp: Opc = NVPTX::SULD_3D_V4I8_CLAMP_R; break;
  case NVPTXISD::Suld3DV4I16Clamp: Opc = NVPTX::SULD_3D_V4I16_CLAMP_R; break;
  case NVPTXISD::Suld3DV4I32Clamp: Opc = NVPTX::SULD_3D_V4I32_CLAMP_R; break;
  case NVPTXISD::Suld1DI8Trap: Opc = NVPTX::SULD_1D_I8_TRAP_R; break;
  case NVPTXISD::Suld1DI16Trap: Opc = NVPTX::SULD_1D_I16_TRAP_R; break;
  case NVPTXISD::Suld1DI32Trap: Opc = NVPTX::SULD_1D_I32_TRAP_R; break;
  case NVPTXISD::Suld1DI64Trap: Opc = NVPTX::SULD_1D_I64_TRAP_R; break;
  case NVPTXISD::Suld1DV2I8Trap: Opc = NVPTX::SULD_1D_V2I8_TRAP_R; break;
  case NVPTXISD::Suld1DV2I16Trap: Opc = NVPTX::SULD_1D_V2I16_TRAP_R; break;
  case NVPTXISD::Suld1DV2I32Trap: Opc = NVPTX::SULD_1D_V2I32_TRAP_R; break;
  case NVPTXISD::Suld1DV2I64Trap: Opc = NVPTX::SULD_1D_V2I64_TRAP_R; break;
  case NVPTXISD::Suld1DV4I8Trap: Opc = NVPTX::SULD_1D_V4I8_TRAP_R; break;
  case NVPTXISD::Suld1DV4I16Trap: Opc = NVPTX::SULD_1D_V4I16_TRAP_R; break;
  case NVPTXISD::Suld1DV4I32Trap: Opc = NVPTX::SULD_1D_V4I32_TRAP_R; break;
  case NVPTXISD::Suld1DArrayI8Trap: Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R; break;
  case NVPTXISD::Suld1DArrayI16Trap: Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R; break;
  case NVPTXISD::Suld1DArrayI32Trap: Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R; break;
  case NVPTXISD::Suld1DArrayI64Trap: Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV2I8Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV2I16Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV2I32Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV2I64Trap: Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV4I8Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV4I16Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R; break;
  case NVPTXISD::Suld1DArrayV4I32Trap: Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R; break;
  case NVPTXISD::Suld2DI8Trap: Opc = NVPTX::SULD_2D_I8_TRAP_R; break;
  case NVPTXISD::Suld2DI16Trap: Opc = NVPTX::SULD_2D_I16_TRAP_R; break;
  case NVPTXISD::Suld2DI32Trap: Opc = NVPTX::SULD_2D_I32_TRAP_R; break;
  case NVPTXISD::Suld2DI64Trap: Opc = NVPTX::SULD_2D_I64_TRAP_R; break;
  case NVPTXISD::Suld2DV2I8Trap: Opc = NVPTX::SULD_2D_V2I8_TRAP_R; break;
  case NVPTXISD::Suld2DV2I16Trap: Opc = NVPTX::SULD_2D_V2I16_TRAP_R; break;
  case NVPTXISD::Suld2DV2I32Trap: Opc = NVPTX::SULD_2D_V2I32_TRAP_R; break;
  case NVPTXISD::Suld2DV2I64Trap: Opc = NVPTX::SULD_2D_V2I64_TRAP_R; break;
  case NVPTXISD::Suld2DV4I8Trap: Opc = NVPTX::SULD_2D_V4I8_TRAP_R; break;
  case NVPTXISD::Suld2DV4I16Trap: Opc = NVPTX::SULD_2D_V4I16_TRAP_R; break;
  case NVPTXISD::Suld2DV4I32Trap: Opc = NVPTX::SULD_2D_V4I32_TRAP_R; break;
  case NVPTXISD::Suld2DArrayI8Trap: Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R; break;
  case NVPTXISD::Suld2DArrayI16Trap: Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R; break;
  case NVPTXISD::Suld2DArrayI32Trap: Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R; break;
  case NVPTXISD::Suld2DArrayI64Trap: Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV2I8Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV2I16Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV2I32Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV2I64Trap: Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV4I8Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV4I16Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R; break;
  case NVPTXISD::Suld2DArrayV4I32Trap: Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R; break;
  case NVPTXISD::Suld3DI8Trap: Opc = NVPTX::SULD_3D_I8_TRAP_R; break;
  case NVPTXISD::Suld3DI16Trap: Opc = NVPTX::SULD_3D_I16_TRAP_R; break;
  case NVPTXISD::Suld3DI32Trap: Opc = NVPTX::SULD_3D_I32_TRAP_R; break;
  case NVPTXISD::Suld3DI64Trap: Opc = NVPTX::SULD_3D_I64_TRAP_R; break;
  case NVPTXISD::Suld3DV2I8Trap: Opc = NVPTX::SULD_3D_V2I8_TRAP_R; break;
  case NVPTXISD::Suld3DV2I16Trap: Opc = NVPTX::SULD_3D_V2I16_TRAP_R; break;
  case NVPTXISD::Suld3DV2I32Trap: Opc = NVPTX::SULD_3D_V2I32_TRAP_R; break;
  case NVPTXISD::Suld3DV2I64Trap: Opc = NVPTX::SULD_3D_V2I64_TRAP_R; break;
  case NVPTXISD::Suld3DV4I8Trap: Opc = NVPTX::SULD_3D_V4I8_TRAP_R; break;
  case NVPTXISD::Suld3DV4I16Trap: Opc = NVPTX::SULD_3D_V4I16_TRAP_R; break;
  case NVPTXISD::Suld3DV4I32Trap: Opc = NVPTX::SULD_3D_V4I32_TRAP_R; break;
  case NVPTXISD::Suld1DI8Zero: Opc = NVPTX::SULD_1D_I8_ZERO_R; break;
  case NVPTXISD::Suld1DI16Zero: Opc = NVPTX::SULD_1D_I16_ZERO_R; break;
  case NVPTXISD::Suld1DI32Zero: Opc = NVPTX::SULD_1D_I32_ZERO_R; break;
  case NVPTXISD::Suld1DI64Zero: Opc = NVPTX::SULD_1D_I64_ZERO_R; break;
  case NVPTXISD::Suld1DV2I8Zero: Opc = NVPTX::SULD_1D_V2I8_ZERO_R; break;
  case NVPTXISD::Suld1DV2I16Zero: Opc = NVPTX::SULD_1D_V2I16_ZERO_R; break;
  case NVPTXISD::Suld1DV2I32Zero: Opc = NVPTX::SULD_1D_V2I32_ZERO_R; break;
  case NVPTXISD::Suld1DV2I64Zero: Opc = NVPTX::SULD_1D_V2I64_ZERO_R; break;
  case NVPTXISD::Suld1DV4I8Zero: Opc = NVPTX::SULD_1D_V4I8_ZERO_R; break;
  case NVPTXISD::Suld1DV4I16Zero: Opc = NVPTX::SULD_1D_V4I16_ZERO_R; break;
  case NVPTXISD::Suld1DV4I32Zero: Opc = NVPTX::SULD_1D_V4I32_ZERO_R; break;
  case NVPTXISD::Suld1DArrayI8Zero: Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R; break;
  case NVPTXISD::Suld1DArrayI16Zero: Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R; break;
  case NVPTXISD::Suld1DArrayI32Zero: Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R; break;
  case NVPTXISD::Suld1DArrayI64Zero: Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV2I8Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV2I16Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV2I32Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV2I64Zero: Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV4I8Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV4I16Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R; break;
  case NVPTXISD::Suld1DArrayV4I32Zero: Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R; break;
  case NVPTXISD::Suld2DI8Zero: Opc = NVPTX::SULD_2D_I8_ZERO_R; break;
  case NVPTXISD::Suld2DI16Zero: Opc = NVPTX::SULD_2D_I16_ZERO_R; break;
  case NVPTXISD::Suld2DI32Zero: Opc = NVPTX::SULD_2D_I32_ZERO_R; break;
  case NVPTXISD::Suld2DI64Zero: Opc = NVPTX::SULD_2D_I64_ZERO_R; break;
  case NVPTXISD::Suld2DV2I8Zero: Opc = NVPTX::SULD_2D_V2I8_ZERO_R; break;
  case NVPTXISD::Suld2DV2I16Zero: Opc = NVPTX::SULD_2D_V2I16_ZERO_R; break;
  case NVPTXISD::Suld2DV2I32Zero: Opc = NVPTX::SULD_2D_V2I32_ZERO_R; break;
  case NVPTXISD::Suld2DV2I64Zero: Opc = NVPTX::SULD_2D_V2I64_ZERO_R; break;
  case NVPTXISD::Suld2DV4I8Zero: Opc = NVPTX::SULD_2D_V4I8_ZERO_R; break;
  case NVPTXISD::Suld2DV4I16Zero: Opc = NVPTX::SULD_2D_V4I16_ZERO_R; break;
  case NVPTXISD::Suld2DV4I32Zero: Opc = NVPTX::SULD_2D_V4I32_ZERO_R; break;
  case NVPTXISD::Suld2DArrayI8Zero: Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R; break;
  case NVPTXISD::Suld2DArrayI16Zero: Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R; break;
  case NVPTXISD::Suld2DArrayI32Zero: Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R; break;
  case NVPTXISD::Suld2DArrayI64Zero: Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV2I8Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV2I16Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV2I32Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV2I64Zero: Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV4I8Zero: Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV4I16Zero: Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R; break;
  case NVPTXISD::Suld2DArrayV4I32Zero: Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R; break;
  case NVPTXISD::Suld3DI8Zero: Opc = NVPTX::SULD_3D_I8_ZERO_R; break;
  case NVPTXISD::Suld3DI16Zero: Opc = NVPTX::SULD_3D_I16_ZERO_R; break;
  case NVPTXISD::Suld3DI32Zero: Opc = NVPTX::SULD_3D_I32_ZERO_R; break;
  case NVPTXISD::Suld3DI64Zero: Opc = NVPTX::SULD_3D_I64_ZERO_R; break;
  case NVPTXISD::Suld3DV2I8Zero: Opc = NVPTX::SULD_3D_V2I8_ZERO_R; break;
  case NVPTXISD::Suld3DV2I16Zero: Opc = NVPTX::SULD_3D_V2I16_ZERO_R; break;
  case NVPTXISD::Suld3DV2I32Zero: Opc = NVPTX::SULD_3D_V2I32_ZERO_R; break;
  case NVPTXISD::Suld3DV2I64Zero: Opc = NVPTX::SULD_3D_V2I64_ZERO_R; break;
  case NVPTXISD::Suld3DV4I8Zero: Opc = NVPTX::SULD_3D_V4I8_ZERO_R; break;
  case NVPTXISD::Suld3DV4I16Zero: Opc = NVPTX::SULD_3D_V4I16_ZERO_R; break;
  case NVPTXISD::Suld3DV4I32Zero: Opc = NVPTX::SULD_3D_V4I32_ZERO_R; break;
  }

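  // For example: a surface intrinsic node arrives here with operands
  // (chain, handle, coords...); the lines below drop the leading chain and
  // re-append it, so the machine node sees (handle, coords..., chain).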

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}

/// tryBFE - Look for instruction sequences that can be made more efficient
/// by using the 'bfe' (bit-field extract) PTX instruction.
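///
/// A few illustrative i32 patterns and the resulting PTX (register names
/// are hypothetical):
///   (and (srl %val, 8), 255)    -> bfe.u32 %r, %val, 8, 8
///   (srl (and %val, 0xff00), 8) -> bfe.u32 %r, %val, 8, 8
///   (sra (shl %val, 24), 24)    -> bfe.s32 %r, %val, 0, 8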
bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue Len;
  SDValue Start;
  SDValue Val;
  bool IsSigned = false;

  if (N->getOpcode() == ISD::AND) {
    // Canonicalize the operands
    // We want 'and %val, %mask'
    if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
      std::swap(LHS, RHS);
    }

    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
    if (!Mask) {
      // We need a constant mask on the RHS of the AND
      return false;
    }

    // Extract the mask bits
    uint64_t MaskVal = Mask->getZExtValue();
    if (!isMask_64(MaskVal)) {
      // We *could* handle shifted masks here, but doing so would require an
      // 'and' operation to fix up the low-order bits, so we would trade
      // shr+and for bfe+and, which has the same throughput
      return false;
    }

    // How many bits are in our mask?
    uint64_t NumBits = countTrailingOnes(MaskVal);
    Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);

    if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
      // We have a 'srl/and' pair, extract the effective start bit and length
      Val = LHS.getNode()->getOperand(0);
      Start = LHS.getNode()->getOperand(1);
      ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
      if (StartConst) {
        uint64_t StartVal = StartConst->getZExtValue();
        // How many "good" bits do we have left? "good" is defined here as bits
        // that exist in the original value, not shifted in.
        uint64_t GoodBits = Start.getValueSizeInBits() - StartVal;
        if (NumBits > GoodBits) {
          // Do not handle the case where bits have been shifted in. In theory
          // we could handle this, but the cost is likely higher than just
          // emitting the srl/and pair.
          return false;
        }
        Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
      } else {
        // Do not handle the case where the shift amount (which can be zero if
        // no srl was found) is not constant. We could handle this case, but
        // it would require run-time logic that would be more expensive than
        // just emitting the srl/and pair.
        return false;
      }
    } else {
      // Do not handle the case where the LHS of the and is not a shift. While
      // it would be trivial to handle this case, it would just transform
      // 'and' -> 'bfe', but 'and' has higher throughput.
      return false;
    }
  } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
    if (LHS->getOpcode() == ISD::AND) {
      ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
      if (!ShiftCnst) {
        // Shift amount must be constant
        return false;
      }

      uint64_t ShiftAmt = ShiftCnst->getZExtValue();

      SDValue AndLHS = LHS->getOperand(0);
      SDValue AndRHS = LHS->getOperand(1);

      // Canonicalize the AND to have the mask on the RHS
      if (isa<ConstantSDNode>(AndLHS)) {
        std::swap(AndLHS, AndRHS);
      }

      ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
      if (!MaskCnst) {
        // Mask must be constant
        return false;
      }

      uint64_t MaskVal = MaskCnst->getZExtValue();
      uint64_t NumZeros;
      uint64_t NumBits;
      if (isMask_64(MaskVal)) {
        NumZeros = 0;
        // The number of bits in the result bitfield will be the number of
        // trailing ones (the AND) minus the number of bits we shift off
        NumBits = countTrailingOnes(MaskVal) - ShiftAmt;
      } else if (isShiftedMask_64(MaskVal)) {
        NumZeros = countTrailingZeros(MaskVal);
        unsigned NumOnes = countTrailingOnes(MaskVal >> NumZeros);
        // The number of bits in the result bitfield will be the number of
        // trailing zeros plus the number of set bits in the mask minus the
        // number of bits we shift off
        NumBits = NumZeros + NumOnes - ShiftAmt;
      } else {
        // This is not a mask we can handle
        return false;
      }

      if (ShiftAmt < NumZeros) {
        // Handling this case would require extra logic that would make this
        // transformation non-profitable
        return false;
      }

      Val = AndLHS;
      Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
      Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
    } else if (LHS->getOpcode() == ISD::SHL) {
      // Here, we have a pattern like:
      //
      // (sra (shl val, NN), MM)
      // or
      // (srl (shl val, NN), MM)
      //
      // If MM >= NN, we can efficiently optimize this with bfe
      Val = LHS->getOperand(0);

      SDValue ShlRHS = LHS->getOperand(1);
      ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
      if (!ShlCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t InnerShiftAmt = ShlCnst->getZExtValue();

      SDValue ShrRHS = RHS;
      ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
      if (!ShrCnst) {
        // Shift amount must be constant
        return false;
      }
      uint64_t OuterShiftAmt = ShrCnst->getZExtValue();

      // To avoid extra codegen and be profitable, we need Outer >= Inner
      if (OuterShiftAmt < InnerShiftAmt) {
        return false;
      }

      // If the outer shift is more than the type size, we have no bitfield to
      // extract (since we also check that the inner shift is <= the outer
      // shift, this also implies that the inner shift is < the type size).
      if (OuterShiftAmt >= Val.getValueSizeInBits()) {
        return false;
      }

      Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
                                        MVT::i32);
      Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
                                      DL, MVT::i32);

      if (N->getOpcode() == ISD::SRA) {
        // If we have an arithmetic right shift, we need to use the signed bfe
        // variant
        IsSigned = true;
      }
    } else {
      // No other LHS patterns are profitable to turn into bfe
      return false;
    }
  } else {
    // Not an opcode we can handle
    return false;
  }
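
  // At this point Val/Start/Len describe the field to extract. PTX's
  // 'bfe.{u,s}{32,64} d, a, b, c' reads c bits of a starting at bit b; the
  // signed forms sign-extend the extracted field (see the PTX ISA manual).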
  unsigned Opc;
  // For the BFE operations we form here from "and" and "srl", always use the
  // unsigned variants; only the sra-of-shl pattern above sets IsSigned.
  if (Val.getValueType() == MVT::i32) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S32rii;
    } else {
      Opc = NVPTX::BFE_U32rii;
    }
  } else if (Val.getValueType() == MVT::i64) {
    if (IsSigned) {
      Opc = NVPTX::BFE_S64rii;
    } else {
      Opc = NVPTX::BFE_U64rii;
    }
  } else {
    // We cannot handle this type
    return false;
  }

  SDValue Ops[] = {
    Val, Start, Len
  };

  ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
  return true;
}

// SelectDirectAddr - Match a direct address for DAG.
// A direct address could be a globaladdress or externalsymbol.
bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
  // Return true if TGA or ES.
  if (N.getOpcode() == ISD::TargetGlobalAddress ||
      N.getOpcode() == ISD::TargetExternalSymbol) {
    Address = N;
    return true;
  }
  if (N.getOpcode() == NVPTXISD::Wrapper) {
    Address = N.getOperand(0);
    return true;
  }
  // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
  if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
    if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
        CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
        CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
      return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
  }
  return false;
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  if (Addr.getOpcode() == ISD::ADD) {
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      SDValue base = Addr.getOperand(0);
      if (SelectDirectAddr(base, Base)) {
        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                           mvt);
        return true;
      }
    }
  }
  return false;
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// symbol+offset
bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
}

// register+offset
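// Illustration (names hypothetical): a bare frame index %fi yields
// Base = TargetFrameIndex(%fi), Offset = 0, while (add %ptr, 12) yields
// Base = %ptr, Offset = 12, i.e. a PTX address of the form [%ptr+12].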
bool NVPTXDAGToDAGISel::SelectADDRri_imp(
    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
    Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
    return true;
  }
  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
      Addr.getOpcode() == ISD::TargetGlobalAddress)
    return false; // direct calls.

  if (Addr.getOpcode() == ISD::ADD) {
    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
      return false;
    }
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      if (FrameIndexSDNode *FIN =
              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
        // Constant offset from frame ref.
        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
      else
        Base = Addr.getOperand(0);
      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
                                         mvt);
      return true;
    }
  }
  return false;
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
                                     SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
}

// register+offset
bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
                                       SDValue &Base, SDValue &Offset) {
  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
}

bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                 unsigned int spN) const {
  const Value *Src = nullptr;
  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
      return true;
    Src = mN->getMemOperand()->getValue();
  }
  if (!Src)
    return false;
  if (auto *PT = dyn_cast<PointerType>(Src->getType()))
    return (PT->getAddressSpace() == spN);
  return false;
}

/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions. Returns true on failure and false on success,
/// following the SelectionDAGISel convention.
bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
    const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) {
  SDValue Op0, Op1;
  switch (ConstraintID) {
  default:
    return true;
  case InlineAsm::Constraint_m: // memory
    if (SelectDirectAddr(Op, Op0)) {
      OutOps.push_back(Op0);
      OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
      return false;
    }
    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
      OutOps.push_back(Op0);
      OutOps.push_back(Op1);
      return false;
    }
    break;
  }
  return true;
}

/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
/// conversion from \p SrcTy to \p DestTy.
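///
/// For instance, GetConvertOpcode(MVT::i32, MVT::i16, /*IsSigned=*/true)
/// returns NVPTX::CVT_s32_s16, a sign-extending i16-to-i32 conversion, while
/// the unsigned form returns NVPTX::CVT_u32_u16.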
unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
                                             bool IsSigned) {
  switch (SrcTy.SimpleTy) {
  default:
    llvm_unreachable("Unhandled source type");
  case MVT::i8:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
    }
  case MVT::i16:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
    }
  case MVT::i32:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
    case MVT::i64:
      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
    }
  case MVT::i64:
    switch (DestTy.SimpleTy) {
    default:
      llvm_unreachable("Unhandled dest type");
    case MVT::i8:
      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
    case MVT::i16:
      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
    case MVT::i32:
      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
    }
  }
}