//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#if defined(_MSC_VER) || defined(__MINGW32__)
// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
// from the Visual C++ cmath / math.h headers:
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
#define _USE_MATH_DEFINES
#endif

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
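// For example, <3 x s16> (48 bits) is rounded up to the next 32-bit boundary
// (64 bits) and becomes <4 x s16>; the assert below restricts this mutation to
// sub-32-bit element types.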
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
  };
}

static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getElementType() == Type;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S96 = LLT::scalar(96);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S1024 = LLT::scalar(1024);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
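  // Bit operations wider than 64 bits are broken into 64-bit pieces by the
  // fewerElementsIf rule below, e.g. a <4 x s32> AND becomes two <2 x s32> ANDs
  // (see fewerEltsToSize64Vector above).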
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0); // TODO: Implement.

  getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    // FIXME: Testing hack
    .legalForCartesianProduct({S16, LLT::vector(2, 8), });

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S1024)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});


  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
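  // fneg and fabs only touch the sign bit, so they are reported legal for the
  // full set of FP types, including packed v2s16.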
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16())
    FMad.customFor({S32, S16});
  else
    FMad.customFor({S32});
  FMad.scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               {S96, S32},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S32, LLT::scalar(24)}})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // TODO: Split s1->s64 during regbankselect for VALU.
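  // s64 sources with s64 results are custom lowered (see legalizeITOFP below):
  // the two 32-bit halves are converted separately and recombined with ldexp
  // and an add.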
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTR_MASK)
    .scalarize(0)
    .alwaysLegal();

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
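  // The result type (index 0) is therefore clamped to s32, while the source
  // type (index 1) may remain s64.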
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
    switch (AS) {
    // FIXME: Private element size.
    case AMDGPUAS::PRIVATE_ADDRESS:
      return 32;
    // FIXME: Check subtarget
    case AMDGPUAS::LOCAL_ADDRESS:
      return ST.useDS128() ? 128 : 64;

    // Treat constant and global as identical.
    // SMRD loads are sometimes usable for global loads (ideally constant
    // address space should be eliminated) depending on the context. Legality
    // cannot be context dependent, but RegBankSelect can split the load as
    // necessary depending on the pointer register bank/uniformity and if the
    // memory is invariant or not written in a kernel.
    case AMDGPUAS::CONSTANT_ADDRESS:
    case AMDGPUAS::GLOBAL_ADDRESS:
      return 512;
    default:
      return 128;
    }
  };

  const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(AS))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = MemSize / 32;
    if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
      return true;

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Whitelist the common cases.
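    // Each entry below is {result type, pointer type, memory size in bits,
    // minimum alignment in bits}.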
    // TODO: Pointer loads
    // TODO: Wide constant loads
    // TODO: Only CI+ has 3x loads
    // TODO: Loads to s16 on gfx9
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V3S32, GlobalPtr, 96, GlobalAlign32},
                                      {S96, GlobalPtr, 96, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S128, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, FlatPtr, 32, GlobalAlign32},
                                      {S32, FlatPtr, 16, GlobalAlign16},
                                      {S32, FlatPtr, 8, GlobalAlign8},
                                      {V2S16, FlatPtr, 32, GlobalAlign32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V3S32, ConstantPtr, 96, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {S128, ConstantPtr, 128, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions
      .customIf(typeIs(1, Constant32Ptr))
      .narrowScalarIf(
        [=](const LegalityQuery &Query) -> bool {
          return !Query.Types[0].isVector() && needToSplitLoad(Query);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          const unsigned DstSize = DstTy.getSizeInBits();
          unsigned MemSize = Query.MMODescrs[0].SizeInBits;

          // Split extloads.
          if (DstSize > MemSize)
            return std::make_pair(0, LLT::scalar(MemSize));

          if (DstSize > 32 && (DstSize % 32 != 0)) {
            // FIXME: Need a way to specify non-extload of larger size if
            // suitably aligned.
            return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
          }

          unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
          if (MemSize > MaxSize)
            return std::make_pair(0, LLT::scalar(MaxSize));

          unsigned Align = Query.MMODescrs[0].AlignInBits;
          return std::make_pair(0, LLT::scalar(Align));
        })
      .fewerElementsIf(
        [=](const LegalityQuery &Query) -> bool {
          return Query.Types[0].isVector() && needToSplitLoad(Query);
        },
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          const LLT DstTy = Query.Types[0];
          const LLT PtrTy = Query.Types[1];

          LLT EltTy = DstTy.getElementType();
          unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());

          // Split if it's too large for the address space.
          if (Query.MMODescrs[0].SizeInBits > MaxSize) {
            unsigned NumElts = DstTy.getNumElements();
            unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

            // FIXME: Refine when odd breakdowns handled
            // The scalars will need to be re-legalized.
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::make_pair(0, EltTy);

            return std::make_pair(0,
                                  LLT::vector(NumElts / NumPieces, EltTy));
          }

          // Need to split because of alignment.
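          // e.g. a <4 x s64> load that is only 32-bit aligned is first reduced
          // to <2 x s64> pieces here, and may be split again on relegalization.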
          unsigned Align = Query.MMODescrs[0].AlignInBits;
          unsigned EltSize = EltTy.getSizeInBits();
          if (EltSize > Align &&
              (EltSize / Align < DstTy.getNumElements())) {
            return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
          }

          // May need relegalization for the scalars.
          return std::make_pair(0, EltTy);
        })
      .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
      .legalIf([=](const LegalityQuery &Query) {
        const LLT Ty0 = Query.Types[0];
        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        unsigned Align = Query.MMODescrs[0].AlignInBits;

        // FIXME: Widening store from alignment not valid.
        if (MemSize < Size)
          MemSize = std::max(MemSize, Align);

        // No extending vector loads.
        if (Size > MemSize && Ty0.isVector())
          return false;

        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;
        case 96:
          return ST.hasDwordx3LoadStores();
        case 256:
        case 512:
          return true;
        default:
          return false;
        }
      })
      .widenScalarToNextPow2(0)
      // TODO: v3s32->v4s32 with alignment
      .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                               {S32, GlobalPtr, 16, 2 * 8},
                               {S32, LocalPtr, 8, 8},
                               {S32, LocalPtr, 16, 16},
                               {S32, PrivatePtr, 8, 8},
                               {S32, PrivatePtr, 16, 16},
                               {S32, ConstantPtr, 8, 8},
                               {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
    .legalFor({{S32, LocalPtr}});

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
    .lower();

  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
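  // This mirrors the G_ICMP results above: an SCC condition is represented as
  // s32, a VCC condition as s1.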
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    // TODO: Support 16-bit shift amounts
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 1024 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts())
    BuildVector.legalFor({V2S16, S32});

  BuildVector
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  if (ST.hasScalarPackInsts()) {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .lower();
  }

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S1024)
      .lowerFor({{S16, V2S16}});

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 1024;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    B.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  B.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

    // Extract low 32-bits of the pointer.
    B.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
  B.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
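  // The flat pointer is built from the 32-bit segment offset in the low half
  // and the aperture base in the high half.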
  B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  return true;
}

static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Register(Hi))
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setInstr(MI);
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
  Register DstReg, LLT PtrTy,
  MachineIRBuilder &B, const GlobalValue *GV,
  unsigned Offset, unsigned GAFlags) const {
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
1634 // 1635 // For global address space: 1636 // s_getpc_b64 s[0:1] 1637 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1638 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1639 // 1640 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1641 // fixups or relocations are emitted to replace $symbol@*@lo and 1642 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1643 // which is a 64-bit pc-relative offset from the encoding of the $symbol 1644 // operand to the global variable. 1645 // 1646 // What we want here is an offset from the value returned by s_getpc 1647 // (which is the address of the s_add_u32 instruction) to the global 1648 // variable, but since the encoding of $symbol starts 4 bytes after the start 1649 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1650 // small. This requires us to add 4 to the global variable offset in order to 1651 // compute the correct address. 1652 1653 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1654 1655 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1656 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1657 1658 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1659 .addDef(PCReg); 1660 1661 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1662 if (GAFlags == SIInstrInfo::MO_NONE) 1663 MIB.addImm(0); 1664 else 1665 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1666 1667 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1668 1669 if (PtrTy.getSizeInBits() == 32) 1670 B.buildExtract(DstReg, PCReg, 0); 1671 return true; 1672 } 1673 1674 bool AMDGPULegalizerInfo::legalizeGlobalValue( 1675 MachineInstr &MI, MachineRegisterInfo &MRI, 1676 MachineIRBuilder &B) const { 1677 Register DstReg = MI.getOperand(0).getReg(); 1678 LLT Ty = MRI.getType(DstReg); 1679 unsigned AS = Ty.getAddressSpace(); 1680 1681 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1682 MachineFunction &MF = B.getMF(); 1683 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1684 B.setInstr(MI); 1685 1686 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1687 if (!MFI->isEntryFunction()) { 1688 const Function &Fn = MF.getFunction(); 1689 DiagnosticInfoUnsupported BadLDSDecl( 1690 Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1691 Fn.getContext().diagnose(BadLDSDecl); 1692 } 1693 1694 // TODO: We could emit code to handle the initialization somewhere. 
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    8 /*Size*/, 8 /*Align*/);

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeLoad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, GISelChangeObserver &Observer) const {
  B.setInstr(MI);
  LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
  Observer.changingInstr(MI);
  MI.getOperand(1).setReg(Cast.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
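  // G_FMAD is only kept legal when denormals for the type are flushed;
  // otherwise fall through and let LegalizerHelper lower it.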
  if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(SITargetLowering::isFlatGlobalAddrSpace(
           MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::vector(2, ValTy);

  B.setInstr(MI);
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineInstr *&Br) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  if (UseMI.getParent() != MI.getParent() ||
      UseMI.getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR
  MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
  if (Next != MI.getParent()->end()) {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
  }

  return &UseMI;
}

Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these

  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    // FIXME: Should have scoped insert pt
    MachineBasicBlock &OrigInsBB = B.getMBB();
    auto OrigInsPt = B.getInsertPt();

    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());

    B.setInsertPt(OrigInsBB, OrigInsPt);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT ResTy = MRI.getType(Res);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  const MachineFunction &MF = B.getMF();
  bool Unsafe =
    MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);

  if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
    return false;

  if (!Unsafe && ResTy == S32 &&
      MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(RHS)
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
        .addUse(FNeg.getReg(0))
        .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  if (Unsafe) {
    auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
      .addUse(RHS)
      .setMIFlags(Flags);
    B.buildFMul(Res, LHS, RCP, Flags);

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                    MachineRegisterInfo &MRI,
                                    MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; otherwise, disable it.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
                                   ? FP_DENORM_FLUSH_NONE
                                   : FP_DENORM_FLUSH_IN_FLUSH_OUT;

    unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}

bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(RHS)
      .addUse(LHS)
      .addImm(1)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
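  // If FP32 denormals are flushed by default, temporarily enable FP32 denorm
  // handling around the FMA refinement chain below, then restore the mode
  // before div_fmas.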
  if (!Mode.FP32Denormals)
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.FP32Denormals)
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
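    // Reconstruct the scale condition from the high 32-bit halves of the
    // operands and of the div_scale results.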

    Scale = MRI.createGenericVirtualRegister(S1);

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    B.buildXor(Scale, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  B.setInstr(MI);
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(Mul0.getReg(0))
    .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  B.setInstr(MI);
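  // A flat pointer lies in the queried segment iff the high 32 bits of the
  // address match that segment's aperture base.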
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg) const {
  if (!ST.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();

  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B,
                                                 bool IsFormat) const {
  // TODO: Reject f16 format on targets where unsupported.
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);

  B.setInstr(MI);

  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    MI.getOperand(1).setReg(AnyExt);
    return true;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
      return true;
    }

    return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
  }

  return Ty == S32;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
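  // amdgcn_if, amdgcn_else and amdgcn_loop produce a condition consumed by a
  // single G_BRCOND; verifyCFIntrinsic locates that branch so it can be
  // rewritten to SI_IF / SI_ELSE / SI_LOOP.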
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
      if (Br)
        BrTarget = Br->getOperand(0).getMBB();

      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
          .addDef(Def)
          .addUse(Use)
          .addMBB(BrTarget)
          .addImm(0);
      }

      if (Br)
        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);

      // FIXME: Need to adjust branch targets based on unconditional branch.
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.setInstr(MI);
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_raw_buffer_store:
    return legalizeRawBufferStore(MI, MRI, B, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
    return legalizeRawBufferStore(MI, MRI, B, true);
  default:
    return true;
  }

  return true;
}