//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
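// For example, <3 x s16> (48 bits) is widened to <4 x s16> (64 bits).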
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 =
      LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.
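
  // No instruction produces the signed-overflow flag directly, so rely on the
  // generic lowering.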
  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ?
                 S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  // TODO: Split s1->s64 during regbankselect for VALU.
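  // Type index 0 is the FP result, type index 1 is the integer source.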
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
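  // Clamp the result type to 32 bits and allow the source to be 32 or 64 bits.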
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical.
    // SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in
    // a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
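    // Each entry below is {value type, pointer type, memory size in bits,
    // minimum alignment in bits}.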
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
        .customIf(typeIs(1, Constant32Ptr))
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() && needToSplitLoad(Query);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              unsigned EltSize = EltTy.getSizeInBits();
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .legalIf([=](const LegalityQuery &Query) {
          const LLT Ty0 = Query.Types[0];
          unsigned Size = Ty0.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;
          unsigned Align = Query.MMODescrs[0].AlignInBits;

          // No extending vector loads.
          if (Size > MemSize && Ty0.isVector())
            return false;

          // FIXME: Widening store from alignment not valid.
          if (MemSize < Size)
            MemSize = std::max(MemSize, Align);

          switch (MemSize) {
          case 8:
          case 16:
            return Size == 32;
          case 32:
          case 64:
          case 128:
            return true;
          case 96:
            return ST.hasDwordx3LoadStores();
          case 256:
          case 512:
            return true;
          default:
            return false;
          }
        })
        .widenScalarToNextPow2(0)
        // TODO: v3s32->v4s32 with alignment
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
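  // Type index 0 is the shifted value, type index 1 is the shift amount.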
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline
    // constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
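    // Keep only the low 32 bits of the source pointer.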
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
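  // Adding and then subtracting 2^52 (copysigned from the source) rounds the
  // value to an integer in double precision.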
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
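  // The sign bit occupies the high half of the 64-bit value; the low half is
  // zero.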
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  B.setInstr(MI);

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
      Fn.getContext().diagnose(BadLDSDecl);
    }

    // TODO: We could emit code to handle the initialization somewhere.
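    // An uninitialized LDS global is lowered to its constant offset within the
    // kernel's LDS allocation.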
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  // TODO: Always legal with future ftz flag.
  if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
    return true;

  MachineFunction &MF = B.getMF();

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ?
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  return false;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

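  // The rcp-based fast path is skipped for f32 when denormals are enabled
  // (unless unsafe math permits it), presumably because the hardware
  // reciprocal approximation does not preserve denormal results.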
  if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32),
                             MI.getOperand(2).getReg(), 32);
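  // The pointer lies in the queried segment iff its high 32 bits equal that
  // segment's aperture base.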
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where it is unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 and i16 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the G_BRCOND use with pseudos that manipulate exec and branch.
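  // The boolean result of amdgcn.if / amdgcn.loop must feed a G_BRCOND in the
  // same block (checked by verifyCFIntrinsic); the intrinsic and that branch
  // are then folded into a single SI_IF / SI_LOOP pseudo.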
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
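  // Anything not handled above needs no custom legalization here.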
  default:
    return true;
  }

  return true;
}