//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 512) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {
      const int EltSize = Ty.getElementType().getSizeInBits();
      return EltSize == 32 || EltSize == 64 ||
             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
             EltSize == 128 || EltSize == 256;
    }

    return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
  };
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  setAction({G_BRCOND, S1}, Legal);

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}})
    .clampScalar(0, S32, S32);

  getActionDefinitionsBuilder(G_BITCAST)
    .legalForCartesianProduct({S32, V2S16})
    .legalForCartesianProduct({S64, V2S32, V4S16})
    .legalForCartesianProduct({V2S64, V4S32})
    // Don't worry about the size constraint.
    .legalIf(all(isPointer(0), isPointer(1)));

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64, S16})
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
    .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
               ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, S512)
    .legalIf(isMultiple32(0))
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);


  // FIXME: i1 operands to intrinsics should always be legal, but other i1
  // values may not be legal. We need to figure out how to distinguish
  // between these two scenarios.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // TODO: Implement
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);
  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
  getActionDefinitionsBuilder(G_FCOPYSIGN).lower();

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1},
               // FIXME: Hack
               {S64, LLT::scalar(33)},
               {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}})
    .lowerFor({{S32, S64}})
    .customFor({{S64, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}})
    .scalarize(0);

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .legalFor({S32, S64})
    .scalarize(0);

  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_GEP)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0);

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalFor({{S1, S32}, {S1, S64}});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeIs(0, S1), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1},
                              ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fexp, flog2, flog10 need to be custom lowered.
  getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
                               G_FLOG, G_FLOG2, G_FLOG10})
    .legalFor({S32})
    .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
                               G_CTTZ, G_CTTZ_ZERO_UNDEF,
                               G_CTPOP})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // TODO: Expand for > s32
  getActionDefinitionsBuilder(G_BSWAP)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S32)
        .widenScalarToNextPow2(0)
        .scalarize(0);
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)
        .scalarize(0);
    }
  } else {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0);
  }

  auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() <
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    return [=](const LegalityQuery &Query) {
      return Query.Types[TypeIdx0].getSizeInBits() >
             Query.Types[TypeIdx1].getSizeInBits();
    };
  };

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(greaterThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      greaterThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  if (ST.hasFlatAddressSpace()) {
    getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
      .scalarize(0)
      .custom();
  }

  // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
  // handle some operations by just promoting the register during
  // selection. There are also d16 loads on GFX9+ which preserve the high bits.
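  // The rules below narrow scalars wider than 32 bits when the memory size is
  // smaller than the register size, split 96-bit vector accesses when dwordx3
  // loads/stores are unavailable, and otherwise accept the dword-multiple
  // cases enumerated in the legality predicate.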
  getActionDefinitionsBuilder({G_LOAD, G_STORE})
    .narrowScalarIf([](const LegalityQuery &Query) {
        unsigned Size = Query.Types[0].getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (Size > 32 && MemSize < Size);
      },
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(32));
      })
    .fewerElementsIf([=](const LegalityQuery &Query) {
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        return (MemSize == 96) &&
               Query.Types[0].isVector() &&
               !ST.hasDwordx3LoadStores();
      },
      [=](const LegalityQuery &Query) {
        return std::make_pair(0, V2S32);
      })
    .legalIf([=](const LegalityQuery &Query) {
        const LLT &Ty0 = Query.Types[0];

        unsigned Size = Ty0.getSizeInBits();
        unsigned MemSize = Query.MMODescrs[0].SizeInBits;
        if (Size < 32 || (Size > 32 && MemSize < Size))
          return false;

        if (Ty0.isVector() && Size != MemSize)
          return false;

        // TODO: Decompose private loads into 4-byte components.
        // TODO: Illegal flat loads on SI
        switch (MemSize) {
        case 8:
        case 16:
          return Size == 32;
        case 32:
        case 64:
        case 128:
          return true;

        case 96:
          return ST.hasDwordx3LoadStores();

        case 256:
        case 512:
          // TODO: Possibly support loads of i256 and i512. This will require
          // adding i256 and i512 types to MVT in order to be able to use
          // TableGen.
          // TODO: Add support for other vector types, this will require
          //       defining more value mappings for the new types.
          return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
                                    Ty0.getScalarType().getSizeInBits() == 64);

        default:
          return false;
        }
      })
    .clampScalar(0, S32, S64);


  // FIXME: Handle alignment requirements.
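  // 8 and 16-bit extending loads to a 32-bit result are legal from the global,
  // local and private address spaces (and flat, when the subtarget has a flat
  // address space); the remaining cases are clamped, widened, or lowered.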
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({
                           {S32, GlobalPtr, 8, 8},
                           {S32, GlobalPtr, 16, 8},
                           {S32, LocalPtr, 8, 8},
                           {S32, LocalPtr, 16, 8},
                           {S32, PrivatePtr, 8, 8},
                           {S32, PrivatePtr, 16, 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
                                       {S32, FlatPtr, 16, 8}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // TODO: Pointer types, any 32-bit or 64-bit vector
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
                               GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
                               LLT::vector(2, LocalPtr),
                               LLT::vector(2, PrivatePtr)}, {S1})
    .clampScalar(0, S16, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .scalarize(1)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeIs(1, S1)));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S32}, {S16, S16}});

    Shifts.clampScalar(1, S16, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

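  // Extracts and inserts with a constant index are lowered to G_EXTRACT /
  // G_INSERT in legalizeExtractVectorElt / legalizeInsertVectorElt below;
  // dynamic indices are left for register indexing during selection.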
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= 512 &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V16S32)
    .clampNumElements(0, V2S64, V8S64)
    .minScalarSameAs(1, 0)
    .legalIf(isRegisterType(0))
    .minScalarOrElt(0, S32);

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT &Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    getActionDefinitionsBuilder(Op)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S16, S256)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)

      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, S512)
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT &Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= 512;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &MIRBuilder,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, MIRBuilder);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, MIRBuilder, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
      AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
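    // Pack the hwreg ID, bit offset, and width-minus-one fields into the
    // immediate encoding expected by S_GETREG_B32.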
    unsigned Encoding =
      AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
      Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
      WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register ApertureReg = MRI.createGenericVirtualRegister(S32);
    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
    MIRBuilder.buildInstr(TargetOpcode::G_SHL)
      .addDef(ApertureReg)
      .addUse(GetReg)
      .addUse(ShiftAmt.getReg(0));

    return ApertureReg;
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  // FIXME: Placeholder until we can track the input registers.
  MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // FIXME: Don't use undef
  Value *V = UndefValue::get(PointerType::get(
    Type::getInt8Ty(MF.getFunction().getContext()),
    AMDGPUAS::CONSTANT_ADDRESS));

  MachinePointerInfo PtrInfo(V, StructOffset);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad |
    MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4,
    MinAlign(64, StructOffset));

  Register LoadResult = MRI.createGenericVirtualRegister(S32);
  Register LoadAddr;

  MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
  return LoadResult;
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MachineFunction &MF = MIRBuilder.getMF();

  MIRBuilder.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
    auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);

    Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);

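    // A flat pointer is null-checked against the flat null value; a non-null
    // pointer converts to the segment address space by taking the low 32 bits,
    // and null maps to the segment null value.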
    // Extract low 32-bits of the pointer.
    MIRBuilder.buildExtract(PtrLo32, Src, 0);

    Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
    MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
         SrcAS == AMDGPUAS::PRIVATE_ADDRESS);

  auto SegmentNull =
      MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);

  Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));

  Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
    .addDef(SrcAsInt)
    .addUse(Src);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
  MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &MIRBuilder) const {
  MIRBuilder.setInstr(MI);

  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
  auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
  auto Fabs = MIRBuilder.buildFAbs(Ty, Src);

  auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

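// Extract the 11-bit biased exponent field from the high 32 bits of an f64
// value and subtract the bias (1023) to get the unbiased exponent.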
static MachineInstrBuilder extractF64Exponent(unsigned Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  B.setInstr(MI);

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {
  B.setInstr(MI);

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM.
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  HelperBuilder.setMBB(*MI.getParent());
  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32
  // TODO: Dynamic s64 indexing is only legal for SGPR.
  Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
  if (!IdxVal) // Dynamic case will be selected to register indexing.
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  B.setInstr(MI);

  if (IdxVal.getValue() < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
  return UseMI.getParent() == MI.getParent() &&
         UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
}

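// Return the live-in virtual register associated with the physical register
// \p Reg, creating a new virtual register of type \p Ty if one does not
// already exist.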
Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
                                                Register Reg, LLT Ty) const {
  Register LiveIn = MRI.getLiveInVirtReg(Reg);
  if (LiveIn)
    return LiveIn;

  Register NewReg = MRI.createGenericVirtualRegister(Ty);
  MRI.addLiveIn(Reg, NewReg);
  return NewReg;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg) const {
  if (!Arg->isRegister())
    return false; // TODO: Handle these

  assert(Arg->getRegister() != 0);
  assert(Arg->getRegister().isPhysical());

  MachineRegisterInfo &MRI = *B.getMRI();

  LLT Ty = MRI.getType(DstReg);
  Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);

  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    auto ShiftAmt = B.buildConstant(S32, Shift);
    auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
    B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
  } else
    B.buildCopy(DstReg, LiveIn);

  // Insert the argument copy if it doesn't already exist.
  // FIXME: It seems EmitLiveInCopies isn't called anywhere?
  if (!MRI.getVRegDef(LiveIn)) {
    MachineBasicBlock &EntryMBB = B.getMF().front();
    EntryMBB.addLiveIn(Arg->getRegister());
    B.setInsertPt(EntryMBB, EntryMBB.begin());
    B.buildCopy(LiveIn, Arg->getRegister());
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  B.setInstr(MI);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
  if (!Arg) {
    LLVM_DEBUG(dbgs() << "Required arg register missing\n");
    return false;
  }

  if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  B.setInstr(MI);

  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  const ArgDescriptor *Arg;
  const TargetRegisterClass *RC;
  std::tie(Arg, RC)
    = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  if (!Arg)
    return false;

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B, Arg))
    return false;

  B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  MI.eraseFromParent();
  return true;
}

bool
AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  // Replace the use G_BRCOND with the exec-manipulating branch pseudos.
  switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
  case Intrinsic::amdgcn_if: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();
      B.buildInstr(AMDGPU::SI_IF)
        .addDef(Def)
        .addUse(Use)
        .addMBB(BrCond->getOperand(1).getMBB());

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      B.setInstr(*BrCond);
      Register Reg = MI.getOperand(2).getReg();
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(BrCond->getOperand(1).getMBB());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  default:
    return true;
  }

  return true;
}