//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

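// Checks whether the register type / memory size combination in Query is
// something the hardware load/store instructions handle directly for the
// given address space (extload sizes, dwordx3 availability, alignment).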
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();
  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // FIXME: Clamp offset operand.
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(isPointer(0))
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(typeInSet(1, {S64, S32}))
    .minScalar(1, S32)
    .maxScalarIf(sizeIs(0, 32), 1, S32)
    .maxScalarIf(sizeIs(0, 64), 1, S64)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
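  // G_FEXP2/G_FLOG2 map to native instructions; the remaining transcendental
  // ops below are custom-lowered in terms of them (see legalizeFlog,
  // legalizeFExp and legalizeFPow).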
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

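  // An alignment value of 0 in the memory descriptors below means no minimum
  // alignment is required (the subtarget supports unaligned buffer access).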
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        const LLT Ty = Query.Types[0];
        const unsigned Size = Ty.getSizeInBits();

        if (Size != Query.MMODescrs[0].SizeInBits)
          return Size <= 32 && Ty.isVector();

        if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
          return true;
        return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
               !isRegisterVectorElementType(Ty.getElementType());
      }, bitcastToRegisterType(0));

    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
              return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
            }

            unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                   PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);
            if (MemSize > MaxSize)
              return std::make_pair(0, LLT::scalar(MaxSize));

            unsigned Align = Query.MMODescrs[0].AlignInBits;
            return std::make_pair(0, LLT::scalar(Align));
          })
        .fewerElementsIf(
          [=](const LegalityQuery &Query) -> bool {
            return Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            LLT EltTy = DstTy.getElementType();
            unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                   PtrTy.getAddressSpace(),
                                                   Op == G_LOAD);

            // FIXME: Handle widened to power of 2 results better. This ends
            // up scalarizing.
            // FIXME: 3 element stores scalarized on SI

            // Split if it's too large for the address space.
            if (Query.MMODescrs[0].SizeInBits > MaxSize) {
              unsigned NumElts = DstTy.getNumElements();
              unsigned EltSize = EltTy.getSizeInBits();

              if (MaxSize % EltSize == 0) {
                return std::make_pair(
                  0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
              }

              unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

              // FIXME: Refine when odd breakdowns handled
              // The scalars will need to be re-legalized.
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::make_pair(0, EltTy);

              return std::make_pair(0,
                                    LLT::vector(NumElts / NumPieces, EltTy));
            }

            // FIXME: We could probably handle weird extending loads better.
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;
            if (DstTy.getSizeInBits() > MemSize)
              return std::make_pair(0, EltTy);

            unsigned EltSize = EltTy.getSizeInBits();
            unsigned DstSize = DstTy.getSizeInBits();
            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(
                0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
            }

            // Need to split because of alignment.
            unsigned Align = Query.MMODescrs[0].AlignInBits;
            if (EltSize > Align &&
                (EltSize / Align < DstTy.getNumElements())) {
              return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
            }

            // May need relegalization for the scalars.
            return std::make_pair(0, EltTy);
          })
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    // TODO: Need a bitcast lower option?
    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);
  }
  Shifts.scalarize(0);

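  // Vector element access is custom-lowered when the element type and total
  // vector size fit in registers and the index is 32-bit; see
  // legalizeExtractVectorElt / legalizeInsertVectorElt.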
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          return (EltTy.getSizeInBits() == 16 ||
                  EltTy.getSizeInBits() % 32 == 0) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32);
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(isRegisterType(0));

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      .legalIf([=](const LegalityQuery &Query) {
          const LLT &BigTy = Query.Types[BigTyIdx];
          const LLT &LitTy = Query.Types[LitTyIdx];

          if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
            return false;
          if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
            return false;

          return BigTy.getSizeInBits() % 16 == 0 &&
                 LitTy.getSizeInBits() % 16 == 0 &&
                 BigTy.getSizeInBits() <= MaxRegisterSize;
        })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
  // available, and is selectively legal for s16, s32, v2s16.
  getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
    .scalarize(0)
    .clampScalar(0, S16, S32);

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

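// Dispatch point for the operations marked custom above; each case forwards
// to the corresponding lowering helper. Returning false reports that the
// instruction could not be legalized.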
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

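// Lower G_ADDRSPACE_CAST. Casts between flat and local/private pointers build
// the 64-bit flat pointer from the 32-bit offset and the segment aperture, and
// compare against the null value so that null round-trips correctly.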
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

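  // Round to nearest by adding and subtracting 2^52 with the sign of the
  // input: doubles with magnitude >= 2^52 have no fractional bits, so the
  // rounding happens in the addition. Inputs already at least that large are
  // returned unchanged by the compare/select below.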
1716 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1717 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1718 1719 auto C2 = B.buildFConstant(Ty, C2Val); 1720 auto Fabs = B.buildFAbs(Ty, Src); 1721 1722 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1723 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1724 return true; 1725 } 1726 1727 bool AMDGPULegalizerInfo::legalizeFceil( 1728 MachineInstr &MI, MachineRegisterInfo &MRI, 1729 MachineIRBuilder &B) const { 1730 1731 const LLT S1 = LLT::scalar(1); 1732 const LLT S64 = LLT::scalar(64); 1733 1734 Register Src = MI.getOperand(1).getReg(); 1735 assert(MRI.getType(Src) == S64); 1736 1737 // result = trunc(src) 1738 // if (src > 0.0 && src != result) 1739 // result += 1.0 1740 1741 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1742 1743 const auto Zero = B.buildFConstant(S64, 0.0); 1744 const auto One = B.buildFConstant(S64, 1.0); 1745 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1746 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1747 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1748 auto Add = B.buildSelect(S64, And, One, Zero); 1749 1750 // TODO: Should this propagate fast-math-flags? 1751 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1752 return true; 1753 } 1754 1755 static MachineInstrBuilder extractF64Exponent(unsigned Hi, 1756 MachineIRBuilder &B) { 1757 const unsigned FractBits = 52; 1758 const unsigned ExpBits = 11; 1759 LLT S32 = LLT::scalar(32); 1760 1761 auto Const0 = B.buildConstant(S32, FractBits - 32); 1762 auto Const1 = B.buildConstant(S32, ExpBits); 1763 1764 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1765 .addUse(Const0.getReg(0)) 1766 .addUse(Const1.getReg(0)); 1767 1768 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1769 } 1770 1771 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1772 MachineInstr &MI, MachineRegisterInfo &MRI, 1773 MachineIRBuilder &B) const { 1774 const LLT S1 = LLT::scalar(1); 1775 const LLT S32 = LLT::scalar(32); 1776 const LLT S64 = LLT::scalar(64); 1777 1778 Register Src = MI.getOperand(1).getReg(); 1779 assert(MRI.getType(Src) == S64); 1780 1781 // TODO: Should this use extract since the low half is unused? 1782 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1783 Register Hi = Unmerge.getReg(1); 1784 1785 // Extract the upper half, since this is where we will find the sign and 1786 // exponent. 1787 auto Exp = extractF64Exponent(Hi, B); 1788 1789 const unsigned FractBits = 52; 1790 1791 // Extract the sign bit. 1792 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1793 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1794 1795 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1796 1797 const auto Zero32 = B.buildConstant(S32, 0); 1798 1799 // Extend back to 64-bits. 
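// Merging a zero low word with the extracted sign word yields a 64-bit value
// that is either +0.0 or -0.0. This is the final result whenever the unbiased
// exponent is negative, i.e. |src| < 1.0, where trunc produces a signed zero.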
1800 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1801 1802 auto Shr = B.buildAShr(S64, FractMask, Exp); 1803 auto Not = B.buildNot(S64, Shr); 1804 auto Tmp0 = B.buildAnd(S64, Src, Not); 1805 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1806 1807 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1808 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1809 1810 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1811 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1812 return true; 1813 } 1814 1815 bool AMDGPULegalizerInfo::legalizeITOFP( 1816 MachineInstr &MI, MachineRegisterInfo &MRI, 1817 MachineIRBuilder &B, bool Signed) const { 1818 1819 Register Dst = MI.getOperand(0).getReg(); 1820 Register Src = MI.getOperand(1).getReg(); 1821 1822 const LLT S64 = LLT::scalar(64); 1823 const LLT S32 = LLT::scalar(32); 1824 1825 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1826 1827 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1828 1829 auto CvtHi = Signed ? 1830 B.buildSITOFP(S64, Unmerge.getReg(1)) : 1831 B.buildUITOFP(S64, Unmerge.getReg(1)); 1832 1833 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 1834 1835 auto ThirtyTwo = B.buildConstant(S32, 32); 1836 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 1837 .addUse(CvtHi.getReg(0)) 1838 .addUse(ThirtyTwo.getReg(0)); 1839 1840 // TODO: Should this propagate fast-math-flags? 1841 B.buildFAdd(Dst, LdExp, CvtLo); 1842 MI.eraseFromParent(); 1843 return true; 1844 } 1845 1846 // TODO: Copied from DAG implementation. Verify logic and document how this 1847 // actually works. 1848 bool AMDGPULegalizerInfo::legalizeFPTOI( 1849 MachineInstr &MI, MachineRegisterInfo &MRI, 1850 MachineIRBuilder &B, bool Signed) const { 1851 1852 Register Dst = MI.getOperand(0).getReg(); 1853 Register Src = MI.getOperand(1).getReg(); 1854 1855 const LLT S64 = LLT::scalar(64); 1856 const LLT S32 = LLT::scalar(32); 1857 1858 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1859 1860 unsigned Flags = MI.getFlags(); 1861 1862 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1863 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1864 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1865 1866 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1867 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1868 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1869 1870 auto Hi = Signed ? 
1871 B.buildFPTOSI(S32, FloorMul) : 1872 B.buildFPTOUI(S32, FloorMul); 1873 auto Lo = B.buildFPTOUI(S32, Fma); 1874 1875 B.buildMerge(Dst, { Lo, Hi }); 1876 MI.eraseFromParent(); 1877 1878 return true; 1879 } 1880 1881 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1882 MachineInstr &MI) const { 1883 MachineFunction &MF = Helper.MIRBuilder.getMF(); 1884 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1885 1886 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 1887 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 1888 1889 // With ieee_mode disabled, the instructions have the correct behavior 1890 // already for G_FMINNUM/G_FMAXNUM 1891 if (!MFI->getMode().IEEE) 1892 return !IsIEEEOp; 1893 1894 if (IsIEEEOp) 1895 return true; 1896 1897 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 1898 } 1899 1900 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 1901 MachineInstr &MI, MachineRegisterInfo &MRI, 1902 MachineIRBuilder &B) const { 1903 // TODO: Should move some of this into LegalizerHelper. 1904 1905 // TODO: Promote dynamic indexing of s16 to s32 1906 1907 // FIXME: Artifact combiner probably should have replaced the truncated 1908 // constant before this, so we shouldn't need 1909 // getConstantVRegValWithLookThrough. 1910 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1911 MI.getOperand(2).getReg(), MRI); 1912 if (!IdxVal) // Dynamic case will be selected to register indexing. 1913 return true; 1914 1915 Register Dst = MI.getOperand(0).getReg(); 1916 Register Vec = MI.getOperand(1).getReg(); 1917 1918 LLT VecTy = MRI.getType(Vec); 1919 LLT EltTy = VecTy.getElementType(); 1920 assert(EltTy == MRI.getType(Dst)); 1921 1922 if (IdxVal->Value < VecTy.getNumElements()) 1923 B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 1924 else 1925 B.buildUndef(Dst); 1926 1927 MI.eraseFromParent(); 1928 return true; 1929 } 1930 1931 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 1932 MachineInstr &MI, MachineRegisterInfo &MRI, 1933 MachineIRBuilder &B) const { 1934 // TODO: Should move some of this into LegalizerHelper. 1935 1936 // TODO: Promote dynamic indexing of s16 to s32 1937 1938 // FIXME: Artifact combiner probably should have replaced the truncated 1939 // constant before this, so we shouldn't need 1940 // getConstantVRegValWithLookThrough. 1941 Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1942 MI.getOperand(3).getReg(), MRI); 1943 if (!IdxVal) // Dynamic case will be selected to register indexing. 
1944 return true; 1945 1946 Register Dst = MI.getOperand(0).getReg(); 1947 Register Vec = MI.getOperand(1).getReg(); 1948 Register Ins = MI.getOperand(2).getReg(); 1949 1950 LLT VecTy = MRI.getType(Vec); 1951 LLT EltTy = VecTy.getElementType(); 1952 assert(EltTy == MRI.getType(Ins)); 1953 1954 if (IdxVal->Value < VecTy.getNumElements()) 1955 B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 1956 else 1957 B.buildUndef(Dst); 1958 1959 MI.eraseFromParent(); 1960 return true; 1961 } 1962 1963 bool AMDGPULegalizerInfo::legalizeShuffleVector( 1964 MachineInstr &MI, MachineRegisterInfo &MRI, 1965 MachineIRBuilder &B) const { 1966 const LLT V2S16 = LLT::vector(2, 16); 1967 1968 Register Dst = MI.getOperand(0).getReg(); 1969 Register Src0 = MI.getOperand(1).getReg(); 1970 LLT DstTy = MRI.getType(Dst); 1971 LLT SrcTy = MRI.getType(Src0); 1972 1973 if (SrcTy == V2S16 && DstTy == V2S16 && 1974 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1975 return true; 1976 1977 MachineIRBuilder HelperBuilder(MI); 1978 GISelObserverWrapper DummyObserver; 1979 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1980 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1981 } 1982 1983 bool AMDGPULegalizerInfo::legalizeSinCos( 1984 MachineInstr &MI, MachineRegisterInfo &MRI, 1985 MachineIRBuilder &B) const { 1986 1987 Register DstReg = MI.getOperand(0).getReg(); 1988 Register SrcReg = MI.getOperand(1).getReg(); 1989 LLT Ty = MRI.getType(DstReg); 1990 unsigned Flags = MI.getFlags(); 1991 1992 Register TrigVal; 1993 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 1994 if (ST.hasTrigReducedRange()) { 1995 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1996 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1997 .addUse(MulVal.getReg(0)) 1998 .setMIFlags(Flags).getReg(0); 1999 } else 2000 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2001 2002 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2003 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2004 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2005 .addUse(TrigVal) 2006 .setMIFlags(Flags); 2007 MI.eraseFromParent(); 2008 return true; 2009 } 2010 2011 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2012 MachineIRBuilder &B, 2013 const GlobalValue *GV, 2014 int64_t Offset, 2015 unsigned GAFlags) const { 2016 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2017 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2018 // to the following code sequence: 2019 // 2020 // For constant address space: 2021 // s_getpc_b64 s[0:1] 2022 // s_add_u32 s0, s0, $symbol 2023 // s_addc_u32 s1, s1, 0 2024 // 2025 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2026 // a fixup or relocation is emitted to replace $symbol with a literal 2027 // constant, which is a pc-relative offset from the encoding of the $symbol 2028 // operand to the global variable. 
2029 // 2030 // For global address space: 2031 // s_getpc_b64 s[0:1] 2032 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2033 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2034 // 2035 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2036 // fixups or relocations are emitted to replace $symbol@*@lo and 2037 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2038 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2039 // operand to the global variable. 2040 // 2041 // What we want here is an offset from the value returned by s_getpc 2042 // (which is the address of the s_add_u32 instruction) to the global 2043 // variable, but since the encoding of $symbol starts 4 bytes after the start 2044 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2045 // small. This requires us to add 4 to the global variable offset in order to 2046 // compute the correct address. 2047 2048 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2049 2050 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2051 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2052 2053 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2054 .addDef(PCReg); 2055 2056 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2057 if (GAFlags == SIInstrInfo::MO_NONE) 2058 MIB.addImm(0); 2059 else 2060 MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 2061 2062 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2063 2064 if (PtrTy.getSizeInBits() == 32) 2065 B.buildExtract(DstReg, PCReg, 0); 2066 return true; 2067 } 2068 2069 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2070 MachineInstr &MI, MachineRegisterInfo &MRI, 2071 MachineIRBuilder &B) const { 2072 Register DstReg = MI.getOperand(0).getReg(); 2073 LLT Ty = MRI.getType(DstReg); 2074 unsigned AS = Ty.getAddressSpace(); 2075 2076 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2077 MachineFunction &MF = B.getMF(); 2078 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2079 2080 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2081 if (!MFI->isEntryFunction()) { 2082 const Function &Fn = MF.getFunction(); 2083 DiagnosticInfoUnsupported BadLDSDecl( 2084 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2085 DS_Warning); 2086 Fn.getContext().diagnose(BadLDSDecl); 2087 2088 // We currently don't have a way to correctly allocate LDS objects that 2089 // aren't directly associated with a kernel. We do force inlining of 2090 // functions that use local objects. However, if these dead functions are 2091 // not eliminated, we don't want a compile time error. Just emit a warning 2092 // and a trap, since there should be no callable path here. 2093 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2094 B.buildUndef(DstReg); 2095 MI.eraseFromParent(); 2096 return true; 2097 } 2098 2099 // TODO: We could emit code to handle the initialization somewhere. 
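// An LDS global without an initializer is lowered to a plain constant:
// allocateLDSGlobal assigns it a static offset inside the kernel's LDS
// block, and that offset becomes the pointer value. A rough sketch of the
// result (illustrative only, not taken from a test):
//   %ptr:_(p3) = G_CONSTANT i32 <offset assigned to the variable>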
2100 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2101 const SITargetLowering *TLI = ST.getTargetLowering();
2102 if (!TLI->shouldUseLDSConstAddress(GV)) {
2103 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2104 return true; // Leave in place.
2105 }
2106
2107 B.buildConstant(
2108 DstReg,
2109 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2110 MI.eraseFromParent();
2111 return true;
2112 }
2113
2114 const Function &Fn = MF.getFunction();
2115 DiagnosticInfoUnsupported BadInit(
2116 Fn, "unsupported initializer for address space", MI.getDebugLoc());
2117 Fn.getContext().diagnose(BadInit);
2118 return true;
2119 }
2120
2121 const SITargetLowering *TLI = ST.getTargetLowering();
2122
2123 if (TLI->shouldEmitFixup(GV)) {
2124 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2125 MI.eraseFromParent();
2126 return true;
2127 }
2128
2129 if (TLI->shouldEmitPCReloc(GV)) {
2130 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2131 MI.eraseFromParent();
2132 return true;
2133 }
2134
2135 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2136 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2137
2138 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2139 MachinePointerInfo::getGOT(MF),
2140 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2141 MachineMemOperand::MOInvariant,
2142 8 /*Size*/, Align(8));
2143
2144 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2145
2146 if (Ty.getSizeInBits() == 32) {
2147 // Truncate if this is a 32-bit constant address.
2148 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2149 B.buildExtract(DstReg, Load, 0);
2150 } else
2151 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2152
2153 MI.eraseFromParent();
2154 return true;
2155 }
2156
2157 bool AMDGPULegalizerInfo::legalizeLoad(
2158 MachineInstr &MI, MachineRegisterInfo &MRI,
2159 MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2160 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2161 auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2162 Observer.changingInstr(MI);
2163 MI.getOperand(1).setReg(Cast.getReg(0));
2164 Observer.changedInstr(MI);
2165 return true;
2166 }
2167
2168 bool AMDGPULegalizerInfo::legalizeFMad(
2169 MachineInstr &MI, MachineRegisterInfo &MRI,
2170 MachineIRBuilder &B) const {
2171 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2172 assert(Ty.isScalar());
2173
2174 MachineFunction &MF = B.getMF();
2175 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2176
2177 // TODO: Always legal with future ftz flag.
2178 // FIXME: Do we need just output?
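// G_FMAD is only kept when denormals are flushed for the result type, since
// the MAD/MAC instructions it selects to do not handle denormals. Otherwise
// it is expanded below into separate G_FMUL and G_FADD via LegalizerHelper.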
2179 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2180 return true; 2181 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2182 return true; 2183 2184 MachineIRBuilder HelperBuilder(MI); 2185 GISelObserverWrapper DummyObserver; 2186 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2187 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2188 } 2189 2190 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2191 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2192 Register DstReg = MI.getOperand(0).getReg(); 2193 Register PtrReg = MI.getOperand(1).getReg(); 2194 Register CmpVal = MI.getOperand(2).getReg(); 2195 Register NewVal = MI.getOperand(3).getReg(); 2196 2197 assert(SITargetLowering::isFlatGlobalAddrSpace( 2198 MRI.getType(PtrReg).getAddressSpace()) && 2199 "this should not have been custom lowered"); 2200 2201 LLT ValTy = MRI.getType(CmpVal); 2202 LLT VecTy = LLT::vector(2, ValTy); 2203 2204 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2205 2206 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2207 .addDef(DstReg) 2208 .addUse(PtrReg) 2209 .addUse(PackedVal) 2210 .setMemRefs(MI.memoperands()); 2211 2212 MI.eraseFromParent(); 2213 return true; 2214 } 2215 2216 bool AMDGPULegalizerInfo::legalizeFlog( 2217 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2218 Register Dst = MI.getOperand(0).getReg(); 2219 Register Src = MI.getOperand(1).getReg(); 2220 LLT Ty = B.getMRI()->getType(Dst); 2221 unsigned Flags = MI.getFlags(); 2222 2223 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2224 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2225 2226 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2227 MI.eraseFromParent(); 2228 return true; 2229 } 2230 2231 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2232 MachineIRBuilder &B) const { 2233 Register Dst = MI.getOperand(0).getReg(); 2234 Register Src = MI.getOperand(1).getReg(); 2235 unsigned Flags = MI.getFlags(); 2236 LLT Ty = B.getMRI()->getType(Dst); 2237 2238 auto K = B.buildFConstant(Ty, numbers::log2e); 2239 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2240 B.buildFExp2(Dst, Mul, Flags); 2241 MI.eraseFromParent(); 2242 return true; 2243 } 2244 2245 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2246 MachineIRBuilder &B) const { 2247 Register Dst = MI.getOperand(0).getReg(); 2248 Register Src0 = MI.getOperand(1).getReg(); 2249 Register Src1 = MI.getOperand(2).getReg(); 2250 unsigned Flags = MI.getFlags(); 2251 LLT Ty = B.getMRI()->getType(Dst); 2252 const LLT S16 = LLT::scalar(16); 2253 const LLT S32 = LLT::scalar(32); 2254 2255 if (Ty == S32) { 2256 auto Log = B.buildFLog2(S32, Src0, Flags); 2257 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2258 .addUse(Log.getReg(0)) 2259 .addUse(Src1) 2260 .setMIFlags(Flags); 2261 B.buildFExp2(Dst, Mul, Flags); 2262 } else if (Ty == S16) { 2263 // There's no f16 fmul_legacy, so we need to convert for it. 
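// Same expansion as the f32 case, pow(x, y) = exp2(y * log2(x)), except the
// multiply is performed at f32 with fmul_legacy and the product is truncated
// back to f16 before the exp2.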
2264 auto Log = B.buildFLog2(S16, Src0, Flags); 2265 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2266 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2267 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2268 .addUse(Ext0.getReg(0)) 2269 .addUse(Ext1.getReg(0)) 2270 .setMIFlags(Flags); 2271 2272 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2273 } else 2274 return false; 2275 2276 MI.eraseFromParent(); 2277 return true; 2278 } 2279 2280 // Find a source register, ignoring any possible source modifiers. 2281 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2282 Register ModSrc = OrigSrc; 2283 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2284 ModSrc = SrcFNeg->getOperand(1).getReg(); 2285 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2286 ModSrc = SrcFAbs->getOperand(1).getReg(); 2287 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2288 ModSrc = SrcFAbs->getOperand(1).getReg(); 2289 return ModSrc; 2290 } 2291 2292 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2293 MachineRegisterInfo &MRI, 2294 MachineIRBuilder &B) const { 2295 2296 const LLT S1 = LLT::scalar(1); 2297 const LLT S64 = LLT::scalar(64); 2298 Register Dst = MI.getOperand(0).getReg(); 2299 Register OrigSrc = MI.getOperand(1).getReg(); 2300 unsigned Flags = MI.getFlags(); 2301 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2302 "this should not have been custom lowered"); 2303 2304 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2305 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2306 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2307 // V_FRACT bug is: 2308 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2309 // 2310 // Convert floor(x) to (x - fract(x)) 2311 2312 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2313 .addUse(OrigSrc) 2314 .setMIFlags(Flags); 2315 2316 // Give source modifier matching some assistance before obscuring a foldable 2317 // pattern. 2318 2319 // TODO: We can avoid the neg on the fract? The input sign to fract 2320 // shouldn't matter? 2321 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2322 2323 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2324 2325 Register Min = MRI.createGenericVirtualRegister(S64); 2326 2327 // We don't need to concern ourselves with the snan handling difference, so 2328 // use the one which will directly select. 2329 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2330 if (MFI->getMode().IEEE) 2331 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2332 else 2333 B.buildFMinNum(Min, Fract, Const, Flags); 2334 2335 Register CorrectedFract = Min; 2336 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2337 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2338 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2339 } 2340 2341 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2342 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2343 2344 MI.eraseFromParent(); 2345 return true; 2346 } 2347 2348 // Turn an illegal packed v2s16 build vector into bit operations. 2349 // TODO: This should probably be a bitcast action in LegalizerHelper. 
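// Roughly (illustrative MIR, not taken from a test):
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16)
// becomes
//   %m:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16)
//   %v:_(<2 x s16>) = G_BITCAST %m:_(s32)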
2350 bool AMDGPULegalizerInfo::legalizeBuildVector( 2351 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2352 Register Dst = MI.getOperand(0).getReg(); 2353 const LLT S32 = LLT::scalar(32); 2354 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2355 2356 Register Src0 = MI.getOperand(1).getReg(); 2357 Register Src1 = MI.getOperand(2).getReg(); 2358 assert(MRI.getType(Src0) == LLT::scalar(16)); 2359 2360 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2361 B.buildBitcast(Dst, Merge); 2362 2363 MI.eraseFromParent(); 2364 return true; 2365 } 2366 2367 // Return the use branch instruction, otherwise null if the usage is invalid. 2368 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2369 MachineRegisterInfo &MRI, 2370 MachineInstr *&Br, 2371 MachineBasicBlock *&UncondBrTarget) { 2372 Register CondDef = MI.getOperand(0).getReg(); 2373 if (!MRI.hasOneNonDBGUse(CondDef)) 2374 return nullptr; 2375 2376 MachineBasicBlock *Parent = MI.getParent(); 2377 MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2378 if (UseMI.getParent() != Parent || 2379 UseMI.getOpcode() != AMDGPU::G_BRCOND) 2380 return nullptr; 2381 2382 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2383 MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2384 if (Next == Parent->end()) { 2385 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2386 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2387 return nullptr; 2388 UncondBrTarget = &*NextMBB; 2389 } else { 2390 if (Next->getOpcode() != AMDGPU::G_BR) 2391 return nullptr; 2392 Br = &*Next; 2393 UncondBrTarget = Br->getOperand(0).getMBB(); 2394 } 2395 2396 return &UseMI; 2397 } 2398 2399 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2400 MachineRegisterInfo &MRI, 2401 Register LiveIn, 2402 Register PhyReg) const { 2403 assert(PhyReg.isPhysical() && "Physical register expected"); 2404 2405 // Insert the live-in copy, if required, by defining destination virtual 2406 // register. 2407 // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
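// If the live-in virtual register has no def yet, emit the COPY from the
// physical register at the top of the entry block, then restore the original
// insertion point so the caller keeps building at its current position.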
2408 if (!MRI.getVRegDef(LiveIn)) {
2409 // FIXME: Should have scoped insert pt
2410 MachineBasicBlock &OrigInsBB = B.getMBB();
2411 auto OrigInsPt = B.getInsertPt();
2412
2413 MachineBasicBlock &EntryMBB = B.getMF().front();
2414 EntryMBB.addLiveIn(PhyReg);
2415 B.setInsertPt(EntryMBB, EntryMBB.begin());
2416 B.buildCopy(LiveIn, PhyReg);
2417
2418 B.setInsertPt(OrigInsBB, OrigInsPt);
2419 }
2420
2421 return LiveIn;
2422 }
2423
2424 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2425 MachineRegisterInfo &MRI,
2426 Register PhyReg, LLT Ty,
2427 bool InsertLiveInCopy) const {
2428 assert(PhyReg.isPhysical() && "Physical register expected");
2429
2430 // Get or create the virtual live-in register.
2431 Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2432 if (!LiveIn) {
2433 LiveIn = MRI.createGenericVirtualRegister(Ty);
2434 MRI.addLiveIn(PhyReg, LiveIn);
2435 }
2436
2437 // When the copy that is actually required goes from a virtual register to a
2438 // physical register (and will be inserted later), a live-in copy from the
2439 // physical register to a virtual register is not required here.
2440 if (!InsertLiveInCopy)
2441 return LiveIn;
2442
2443 return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2444 }
2445
2446 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2447 MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2448 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2449 const ArgDescriptor *Arg;
2450 const TargetRegisterClass *RC;
2451 LLT ArgTy;
2452 std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2453 if (!Arg) {
2454 LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2455 return nullptr;
2456 }
2457 return Arg;
2458 }
2459
2460 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2461 const ArgDescriptor *Arg) const {
2462 if (!Arg->isRegister() || !Arg->getRegister().isValid())
2463 return false; // TODO: Handle these
2464
2465 Register SrcReg = Arg->getRegister();
2466 assert(SrcReg.isPhysical() && "Physical register expected");
2467 assert(DstReg.isVirtual() && "Virtual register expected");
2468
2469 MachineRegisterInfo &MRI = *B.getMRI();
2470
2471 LLT Ty = MRI.getType(DstReg);
2472 Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2473
2474 if (Arg->isMasked()) {
2475 // TODO: Should we try to emit this once in the entry block?
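// A masked argument packs several fields into one register (e.g. the packed
// work-item IDs). The field is recovered as (Reg >> Shift) & (Mask >> Shift),
// where Shift is the number of trailing zeros in the mask; that is exactly
// the G_LSHR/G_AND sequence built below.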
2476 const LLT S32 = LLT::scalar(32); 2477 const unsigned Mask = Arg->getMask(); 2478 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2479 2480 Register AndMaskSrc = LiveIn; 2481 2482 if (Shift != 0) { 2483 auto ShiftAmt = B.buildConstant(S32, Shift); 2484 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2485 } 2486 2487 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2488 } else { 2489 B.buildCopy(DstReg, LiveIn); 2490 } 2491 2492 return true; 2493 } 2494 2495 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2496 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2497 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2498 2499 const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2500 if (!Arg) 2501 return false; 2502 2503 if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2504 return false; 2505 2506 MI.eraseFromParent(); 2507 return true; 2508 } 2509 2510 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2511 MachineRegisterInfo &MRI, 2512 MachineIRBuilder &B) const { 2513 Register Dst = MI.getOperand(0).getReg(); 2514 LLT DstTy = MRI.getType(Dst); 2515 LLT S16 = LLT::scalar(16); 2516 LLT S32 = LLT::scalar(32); 2517 LLT S64 = LLT::scalar(64); 2518 2519 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 2520 return true; 2521 2522 if (DstTy == S16) 2523 return legalizeFDIV16(MI, MRI, B); 2524 if (DstTy == S32) 2525 return legalizeFDIV32(MI, MRI, B); 2526 if (DstTy == S64) 2527 return legalizeFDIV64(MI, MRI, B); 2528 2529 return false; 2530 } 2531 2532 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2533 Register DstReg, 2534 Register X, 2535 Register Y, 2536 bool IsDiv) const { 2537 const LLT S1 = LLT::scalar(1); 2538 const LLT S32 = LLT::scalar(32); 2539 2540 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2541 // algorithm used here. 2542 2543 // Initial estimate of inv(y). 2544 auto FloatY = B.buildUITOFP(S32, Y); 2545 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2546 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2547 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2548 auto Z = B.buildFPTOUI(S32, ScaledY); 2549 2550 // One round of UNR. 2551 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2552 auto NegYZ = B.buildMul(S32, NegY, Z); 2553 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2554 2555 // Quotient/remainder estimate. 2556 auto Q = B.buildUMulH(S32, X, Z); 2557 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2558 2559 // First quotient/remainder refinement. 2560 auto One = B.buildConstant(S32, 1); 2561 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2562 if (IsDiv) 2563 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2564 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2565 2566 // Second quotient/remainder refinement. 
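// Each refinement conditionally bumps the quotient by one and subtracts Y
// from the remainder while R >= Y still holds; two rounds of this correction
// are what the expansion relies on (see AMDGPUCodeGenPrepare::expandDivRem32
// for the reasoning behind the estimate's accuracy).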
2567 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2568 if (IsDiv) 2569 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2570 else 2571 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2572 } 2573 2574 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2575 MachineRegisterInfo &MRI, 2576 MachineIRBuilder &B) const { 2577 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2578 Register DstReg = MI.getOperand(0).getReg(); 2579 Register Num = MI.getOperand(1).getReg(); 2580 Register Den = MI.getOperand(2).getReg(); 2581 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2582 MI.eraseFromParent(); 2583 return true; 2584 } 2585 2586 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2587 // 2588 // Return lo, hi of result 2589 // 2590 // %cvt.lo = G_UITOFP Val.lo 2591 // %cvt.hi = G_UITOFP Val.hi 2592 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2593 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2594 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2595 // %mul2 = G_FMUL %mul1, 2**(-32) 2596 // %trunc = G_INTRINSIC_TRUNC %mul2 2597 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2598 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2599 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2600 Register Val) { 2601 const LLT S32 = LLT::scalar(32); 2602 auto Unmerge = B.buildUnmerge(S32, Val); 2603 2604 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2605 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2606 2607 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2608 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2609 2610 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2611 auto Mul1 = 2612 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2613 2614 // 2**(-32) 2615 auto Mul2 = 2616 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2617 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2618 2619 // -(2**32) 2620 auto Mad2 = B.buildFMAD(S32, Trunc, 2621 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2622 2623 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2624 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2625 2626 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2627 } 2628 2629 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2630 Register DstReg, 2631 Register Numer, 2632 Register Denom, 2633 bool IsDiv) const { 2634 const LLT S32 = LLT::scalar(32); 2635 const LLT S64 = LLT::scalar(64); 2636 const LLT S1 = LLT::scalar(1); 2637 Register RcpLo, RcpHi; 2638 2639 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2640 2641 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2642 2643 auto Zero64 = B.buildConstant(S64, 0); 2644 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2645 2646 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2647 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2648 2649 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2650 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2651 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2652 2653 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2654 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2655 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2656 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2657 2658 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2659 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2660 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2661 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2662 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2663 2664 auto Zero32 = 
B.buildConstant(S32, 0); 2665 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2666 auto Add2_HiC = 2667 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2668 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2669 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2670 2671 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2672 Register NumerLo = UnmergeNumer.getReg(0); 2673 Register NumerHi = UnmergeNumer.getReg(1); 2674 2675 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2676 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2677 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2678 Register Mul3_Lo = UnmergeMul3.getReg(0); 2679 Register Mul3_Hi = UnmergeMul3.getReg(1); 2680 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2681 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2682 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2683 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2684 2685 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2686 Register DenomLo = UnmergeDenom.getReg(0); 2687 Register DenomHi = UnmergeDenom.getReg(1); 2688 2689 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2690 auto C1 = B.buildSExt(S32, CmpHi); 2691 2692 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2693 auto C2 = B.buildSExt(S32, CmpLo); 2694 2695 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2696 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2697 2698 // TODO: Here and below portions of the code can be enclosed into if/endif. 2699 // Currently control flow is unconditional and we have 4 selects after 2700 // potential endif to substitute PHIs. 2701 2702 // if C3 != 0 ... 2703 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2704 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2705 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2706 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2707 2708 auto One64 = B.buildConstant(S64, 1); 2709 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2710 2711 auto C4 = 2712 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2713 auto C5 = 2714 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2715 auto C6 = B.buildSelect( 2716 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2717 2718 // if (C6 != 0) 2719 auto Add4 = B.buildAdd(S64, Add3, One64); 2720 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2721 2722 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2723 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2724 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2725 2726 // endif C6 2727 // endif C3 2728 2729 if (IsDiv) { 2730 auto Sel1 = B.buildSelect( 2731 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2732 B.buildSelect(DstReg, 2733 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2734 } else { 2735 auto Sel2 = B.buildSelect( 2736 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2737 B.buildSelect(DstReg, 2738 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2739 } 2740 } 2741 2742 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2743 MachineRegisterInfo &MRI, 2744 MachineIRBuilder &B) const { 2745 const LLT S64 = LLT::scalar(64); 2746 const LLT S32 = LLT::scalar(32); 2747 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2748 Register DstReg = MI.getOperand(0).getReg(); 2749 Register Num 
= MI.getOperand(1).getReg(); 2750 Register Den = MI.getOperand(2).getReg(); 2751 LLT Ty = MRI.getType(DstReg); 2752 2753 if (Ty == S32) 2754 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2755 else if (Ty == S64) 2756 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2757 else 2758 return false; 2759 2760 MI.eraseFromParent(); 2761 return true; 2762 2763 } 2764 2765 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2766 MachineRegisterInfo &MRI, 2767 MachineIRBuilder &B) const { 2768 const LLT S64 = LLT::scalar(64); 2769 const LLT S32 = LLT::scalar(32); 2770 2771 Register DstReg = MI.getOperand(0).getReg(); 2772 const LLT Ty = MRI.getType(DstReg); 2773 if (Ty != S32 && Ty != S64) 2774 return false; 2775 2776 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2777 2778 Register LHS = MI.getOperand(1).getReg(); 2779 Register RHS = MI.getOperand(2).getReg(); 2780 2781 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2782 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2783 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2784 2785 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2786 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2787 2788 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2789 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2790 2791 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2792 if (Ty == S32) 2793 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2794 else 2795 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2796 2797 Register Sign; 2798 if (IsDiv) 2799 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2800 else 2801 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2802 2803 UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2804 B.buildSub(DstReg, UDivRem, Sign); 2805 2806 MI.eraseFromParent(); 2807 return true; 2808 } 2809 2810 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 2811 MachineRegisterInfo &MRI, 2812 MachineIRBuilder &B) const { 2813 Register Res = MI.getOperand(0).getReg(); 2814 Register LHS = MI.getOperand(1).getReg(); 2815 Register RHS = MI.getOperand(2).getReg(); 2816 2817 uint16_t Flags = MI.getFlags(); 2818 2819 LLT ResTy = MRI.getType(Res); 2820 LLT S32 = LLT::scalar(32); 2821 LLT S64 = LLT::scalar(64); 2822 2823 const MachineFunction &MF = B.getMF(); 2824 bool Unsafe = 2825 MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 2826 2827 if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 2828 return false; 2829 2830 if (!Unsafe && ResTy == S32 && 2831 MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 2832 return false; 2833 2834 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 2835 // 1 / x -> RCP(x) 2836 if (CLHS->isExactlyValue(1.0)) { 2837 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2838 .addUse(RHS) 2839 .setMIFlags(Flags); 2840 2841 MI.eraseFromParent(); 2842 return true; 2843 } 2844 2845 // -1 / x -> RCP( FNEG(x) ) 2846 if (CLHS->isExactlyValue(-1.0)) { 2847 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 2848 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 2849 .addUse(FNeg.getReg(0)) 2850 .setMIFlags(Flags); 2851 2852 MI.eraseFromParent(); 2853 return true; 2854 } 2855 } 2856 2857 // x / y -> x * (1.0 / y) 2858 if (Unsafe) { 2859 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 2860 .addUse(RHS) 2861 .setMIFlags(Flags); 2862 B.buildFMul(Res, LHS, RCP, Flags); 2863 2864 MI.eraseFromParent(); 2865 return true; 2866 } 2867 2868 return false; 2869 } 2870 2871 bool 
AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2872 MachineRegisterInfo &MRI, 2873 MachineIRBuilder &B) const { 2874 Register Res = MI.getOperand(0).getReg(); 2875 Register LHS = MI.getOperand(1).getReg(); 2876 Register RHS = MI.getOperand(2).getReg(); 2877 2878 uint16_t Flags = MI.getFlags(); 2879 2880 LLT S16 = LLT::scalar(16); 2881 LLT S32 = LLT::scalar(32); 2882 2883 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2884 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2885 2886 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2887 .addUse(RHSExt.getReg(0)) 2888 .setMIFlags(Flags); 2889 2890 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2891 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2892 2893 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2894 .addUse(RDst.getReg(0)) 2895 .addUse(RHS) 2896 .addUse(LHS) 2897 .setMIFlags(Flags); 2898 2899 MI.eraseFromParent(); 2900 return true; 2901 } 2902 2903 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2904 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2905 static void toggleSPDenormMode(bool Enable, 2906 MachineIRBuilder &B, 2907 const GCNSubtarget &ST, 2908 AMDGPU::SIModeRegisterDefaults Mode) { 2909 // Set SP denorm mode to this value. 2910 unsigned SPDenormMode = 2911 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2912 2913 if (ST.hasDenormModeInst()) { 2914 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2915 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2916 2917 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2918 B.buildInstr(AMDGPU::S_DENORM_MODE) 2919 .addImm(NewDenormModeValue); 2920 2921 } else { 2922 // Select FP32 bit field in mode register. 2923 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2924 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2925 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2926 2927 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2928 .addImm(SPDenormMode) 2929 .addImm(SPDenormModeBitField); 2930 } 2931 } 2932 2933 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2934 MachineRegisterInfo &MRI, 2935 MachineIRBuilder &B) const { 2936 Register Res = MI.getOperand(0).getReg(); 2937 Register LHS = MI.getOperand(1).getReg(); 2938 Register RHS = MI.getOperand(2).getReg(); 2939 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2940 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2941 2942 uint16_t Flags = MI.getFlags(); 2943 2944 LLT S32 = LLT::scalar(32); 2945 LLT S1 = LLT::scalar(1); 2946 2947 auto One = B.buildFConstant(S32, 1.0f); 2948 2949 auto DenominatorScaled = 2950 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2951 .addUse(LHS) 2952 .addUse(RHS) 2953 .addImm(0) 2954 .setMIFlags(Flags); 2955 auto NumeratorScaled = 2956 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2957 .addUse(LHS) 2958 .addUse(RHS) 2959 .addImm(1) 2960 .setMIFlags(Flags); 2961 2962 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2963 .addUse(DenominatorScaled.getReg(0)) 2964 .setMIFlags(Flags); 2965 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2966 2967 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2968 // aren't modeled as reading it. 
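// The scaled FMA refinement below relies on FP32 denormals not being flushed
// to produce a correctly rounded quotient, so if the function's mode flushes
// them, denormal support is enabled for the duration of the Fma0..Fma4 chain
// and the previous mode is restored afterwards.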
2969 if (!Mode.allFP32Denormals()) 2970 toggleSPDenormMode(true, B, ST, Mode); 2971 2972 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2973 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2974 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2975 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2976 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2977 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2978 2979 if (!Mode.allFP32Denormals()) 2980 toggleSPDenormMode(false, B, ST, Mode); 2981 2982 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2983 .addUse(Fma4.getReg(0)) 2984 .addUse(Fma1.getReg(0)) 2985 .addUse(Fma3.getReg(0)) 2986 .addUse(NumeratorScaled.getReg(1)) 2987 .setMIFlags(Flags); 2988 2989 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2990 .addUse(Fmas.getReg(0)) 2991 .addUse(RHS) 2992 .addUse(LHS) 2993 .setMIFlags(Flags); 2994 2995 MI.eraseFromParent(); 2996 return true; 2997 } 2998 2999 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3000 MachineRegisterInfo &MRI, 3001 MachineIRBuilder &B) const { 3002 Register Res = MI.getOperand(0).getReg(); 3003 Register LHS = MI.getOperand(1).getReg(); 3004 Register RHS = MI.getOperand(2).getReg(); 3005 3006 uint16_t Flags = MI.getFlags(); 3007 3008 LLT S64 = LLT::scalar(64); 3009 LLT S1 = LLT::scalar(1); 3010 3011 auto One = B.buildFConstant(S64, 1.0); 3012 3013 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3014 .addUse(LHS) 3015 .addUse(RHS) 3016 .addImm(0) 3017 .setMIFlags(Flags); 3018 3019 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3020 3021 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3022 .addUse(DivScale0.getReg(0)) 3023 .setMIFlags(Flags); 3024 3025 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3026 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3027 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3028 3029 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3030 .addUse(LHS) 3031 .addUse(RHS) 3032 .addImm(1) 3033 .setMIFlags(Flags); 3034 3035 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3036 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3037 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3038 3039 Register Scale; 3040 if (!ST.hasUsableDivScaleConditionOutput()) { 3041 // Workaround a hardware bug on SI where the condition output from div_scale 3042 // is not usable. 
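// Recompute the "which operand was scaled" bit by hand: compare the high 32
// bits of the numerator and denominator against the high halves of the two
// div_scale results and XOR the outcomes, which stands in for the condition
// output the hardware fails to provide.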
3043 3044 LLT S32 = LLT::scalar(32); 3045 3046 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3047 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3048 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3049 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3050 3051 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3052 Scale1Unmerge.getReg(1)); 3053 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3054 Scale0Unmerge.getReg(1)); 3055 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3056 } else { 3057 Scale = DivScale1.getReg(1); 3058 } 3059 3060 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3061 .addUse(Fma4.getReg(0)) 3062 .addUse(Fma3.getReg(0)) 3063 .addUse(Mul.getReg(0)) 3064 .addUse(Scale) 3065 .setMIFlags(Flags); 3066 3067 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3068 .addUse(Fmas.getReg(0)) 3069 .addUse(RHS) 3070 .addUse(LHS) 3071 .setMIFlags(Flags); 3072 3073 MI.eraseFromParent(); 3074 return true; 3075 } 3076 3077 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3078 MachineRegisterInfo &MRI, 3079 MachineIRBuilder &B) const { 3080 Register Res = MI.getOperand(0).getReg(); 3081 Register LHS = MI.getOperand(2).getReg(); 3082 Register RHS = MI.getOperand(3).getReg(); 3083 uint16_t Flags = MI.getFlags(); 3084 3085 LLT S32 = LLT::scalar(32); 3086 LLT S1 = LLT::scalar(1); 3087 3088 auto Abs = B.buildFAbs(S32, RHS, Flags); 3089 const APFloat C0Val(1.0f); 3090 3091 auto C0 = B.buildConstant(S32, 0x6f800000); 3092 auto C1 = B.buildConstant(S32, 0x2f800000); 3093 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3094 3095 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3096 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3097 3098 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3099 3100 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3101 .addUse(Mul0.getReg(0)) 3102 .setMIFlags(Flags); 3103 3104 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3105 3106 B.buildFMul(Res, Sel, Mul1, Flags); 3107 3108 MI.eraseFromParent(); 3109 return true; 3110 } 3111 3112 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3113 MachineRegisterInfo &MRI, 3114 MachineIRBuilder &B) const { 3115 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3116 if (!MFI->isEntryFunction()) { 3117 return legalizePreloadedArgIntrin(MI, MRI, B, 3118 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3119 } 3120 3121 uint64_t Offset = 3122 ST.getTargetLowering()->getImplicitParameterOffset( 3123 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3124 Register DstReg = MI.getOperand(0).getReg(); 3125 LLT DstTy = MRI.getType(DstReg); 3126 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3127 3128 const ArgDescriptor *Arg; 3129 const TargetRegisterClass *RC; 3130 LLT ArgTy; 3131 std::tie(Arg, RC, ArgTy) = 3132 MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 3133 if (!Arg) 3134 return false; 3135 3136 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3137 if (!loadInputValue(KernargPtrReg, B, Arg)) 3138 return false; 3139 3140 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3141 MI.eraseFromParent(); 3142 return true; 3143 } 3144 3145 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3146 MachineRegisterInfo &MRI, 3147 MachineIRBuilder &B, 3148 unsigned AddrSpace) const { 3149 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3150 auto Hi32 = 
B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 3151 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3152 MI.eraseFromParent(); 3153 return true; 3154 } 3155 3156 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3157 // offset (the offset that is included in bounds checking and swizzling, to be 3158 // split between the instruction's voffset and immoffset fields) and soffset 3159 // (the offset that is excluded from bounds checking and swizzling, to go in 3160 // the instruction's soffset field). This function takes the first kind of 3161 // offset and figures out how to split it between voffset and immoffset. 3162 std::tuple<Register, unsigned, unsigned> 3163 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3164 Register OrigOffset) const { 3165 const unsigned MaxImm = 4095; 3166 Register BaseReg; 3167 unsigned TotalConstOffset; 3168 MachineInstr *OffsetDef; 3169 const LLT S32 = LLT::scalar(32); 3170 3171 std::tie(BaseReg, TotalConstOffset, OffsetDef) 3172 = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); 3173 3174 unsigned ImmOffset = TotalConstOffset; 3175 3176 // If the immediate value is too big for the immoffset field, put the value 3177 // and -4096 into the immoffset field so that the value that is copied/added 3178 // for the voffset field is a multiple of 4096, and it stands more chance 3179 // of being CSEd with the copy/add for another similar load/store. 3180 // However, do not do that rounding down to a multiple of 4096 if that is a 3181 // negative number, as it appears to be illegal to have a negative offset 3182 // in the vgpr, even if adding the immediate offset makes it positive. 3183 unsigned Overflow = ImmOffset & ~MaxImm; 3184 ImmOffset -= Overflow; 3185 if ((int32_t)Overflow < 0) { 3186 Overflow += ImmOffset; 3187 ImmOffset = 0; 3188 } 3189 3190 if (Overflow != 0) { 3191 if (!BaseReg) { 3192 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3193 } else { 3194 auto OverflowVal = B.buildConstant(S32, Overflow); 3195 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3196 } 3197 } 3198 3199 if (!BaseReg) 3200 BaseReg = B.buildConstant(S32, 0).getReg(0); 3201 3202 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); 3203 } 3204 3205 /// Handle register layout difference for f16 images for some subtargets. 3206 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3207 MachineRegisterInfo &MRI, 3208 Register Reg) const { 3209 if (!ST.hasUnpackedD16VMem()) 3210 return Reg; 3211 3212 const LLT S16 = LLT::scalar(16); 3213 const LLT S32 = LLT::scalar(32); 3214 LLT StoreVT = MRI.getType(Reg); 3215 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3216 3217 auto Unmerge = B.buildUnmerge(S16, Reg); 3218 3219 SmallVector<Register, 4> WideRegs; 3220 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3221 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3222 3223 int NumElts = StoreVT.getNumElements(); 3224 3225 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3226 } 3227 3228 Register AMDGPULegalizerInfo::fixStoreSourceType( 3229 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3230 MachineRegisterInfo *MRI = B.getMRI(); 3231 LLT Ty = MRI->getType(VData); 3232 3233 const LLT S16 = LLT::scalar(16); 3234 3235 // Fixup illegal register types for i8 stores. 
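// 8- and 16-bit sources are any-extended to 32 bits; the BYTE/SHORT buffer
// store variants only consume the low bits, so the high bits do not matter.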
3236 if (Ty == LLT::scalar(8) || Ty == S16) { 3237 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3238 return AnyExt; 3239 } 3240 3241 if (Ty.isVector()) { 3242 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3243 if (IsFormat) 3244 return handleD16VData(B, *MRI, VData); 3245 } 3246 } 3247 3248 return VData; 3249 } 3250 3251 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3252 MachineRegisterInfo &MRI, 3253 MachineIRBuilder &B, 3254 bool IsTyped, 3255 bool IsFormat) const { 3256 Register VData = MI.getOperand(1).getReg(); 3257 LLT Ty = MRI.getType(VData); 3258 LLT EltTy = Ty.getScalarType(); 3259 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3260 const LLT S32 = LLT::scalar(32); 3261 3262 VData = fixStoreSourceType(B, VData, IsFormat); 3263 Register RSrc = MI.getOperand(2).getReg(); 3264 3265 MachineMemOperand *MMO = *MI.memoperands_begin(); 3266 const int MemSize = MMO->getSize(); 3267 3268 unsigned ImmOffset; 3269 unsigned TotalOffset; 3270 3271 // The typed intrinsics add an immediate after the registers. 3272 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3273 3274 // The struct intrinsic variants add one additional operand over raw. 3275 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3276 Register VIndex; 3277 int OpOffset = 0; 3278 if (HasVIndex) { 3279 VIndex = MI.getOperand(3).getReg(); 3280 OpOffset = 1; 3281 } 3282 3283 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3284 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3285 3286 unsigned Format = 0; 3287 if (IsTyped) { 3288 Format = MI.getOperand(5 + OpOffset).getImm(); 3289 ++OpOffset; 3290 } 3291 3292 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3293 3294 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3295 if (TotalOffset != 0) 3296 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3297 3298 unsigned Opc; 3299 if (IsTyped) { 3300 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3301 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3302 } else if (IsFormat) { 3303 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3304 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3305 } else { 3306 switch (MemSize) { 3307 case 1: 3308 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3309 break; 3310 case 2: 3311 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3312 break; 3313 default: 3314 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3315 break; 3316 } 3317 } 3318 3319 if (!VIndex) 3320 VIndex = B.buildConstant(S32, 0).getReg(0); 3321 3322 auto MIB = B.buildInstr(Opc) 3323 .addUse(VData) // vdata 3324 .addUse(RSrc) // rsrc 3325 .addUse(VIndex) // vindex 3326 .addUse(VOffset) // voffset 3327 .addUse(SOffset) // soffset 3328 .addImm(ImmOffset); // offset(imm) 3329 3330 if (IsTyped) 3331 MIB.addImm(Format); 3332 3333 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3334 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3335 .addMemOperand(MMO); 3336 3337 MI.eraseFromParent(); 3338 return true; 3339 } 3340 3341 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3342 MachineRegisterInfo &MRI, 3343 MachineIRBuilder &B, 3344 bool IsFormat, 3345 bool IsTyped) const { 3346 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
3347 MachineMemOperand *MMO = *MI.memoperands_begin(); 3348 const int MemSize = MMO->getSize(); 3349 const LLT S32 = LLT::scalar(32); 3350 3351 Register Dst = MI.getOperand(0).getReg(); 3352 Register RSrc = MI.getOperand(2).getReg(); 3353 3354 // The typed intrinsics add an immediate after the registers. 3355 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3356 3357 // The struct intrinsic variants add one additional operand over raw. 3358 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3359 Register VIndex; 3360 int OpOffset = 0; 3361 if (HasVIndex) { 3362 VIndex = MI.getOperand(3).getReg(); 3363 OpOffset = 1; 3364 } 3365 3366 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3367 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3368 3369 unsigned Format = 0; 3370 if (IsTyped) { 3371 Format = MI.getOperand(5 + OpOffset).getImm(); 3372 ++OpOffset; 3373 } 3374 3375 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3376 unsigned ImmOffset; 3377 unsigned TotalOffset; 3378 3379 LLT Ty = MRI.getType(Dst); 3380 LLT EltTy = Ty.getScalarType(); 3381 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3382 const bool Unpacked = ST.hasUnpackedD16VMem(); 3383 3384 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3385 if (TotalOffset != 0) 3386 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3387 3388 unsigned Opc; 3389 3390 if (IsTyped) { 3391 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3392 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3393 } else if (IsFormat) { 3394 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3395 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3396 } else { 3397 switch (MemSize) { 3398 case 1: 3399 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3400 break; 3401 case 2: 3402 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3403 break; 3404 default: 3405 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3406 break; 3407 } 3408 } 3409 3410 Register LoadDstReg; 3411 3412 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3413 LLT UnpackedTy = Ty.changeElementSize(32); 3414 3415 if (IsExtLoad) 3416 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3417 else if (Unpacked && IsD16 && Ty.isVector()) 3418 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3419 else 3420 LoadDstReg = Dst; 3421 3422 if (!VIndex) 3423 VIndex = B.buildConstant(S32, 0).getReg(0); 3424 3425 auto MIB = B.buildInstr(Opc) 3426 .addDef(LoadDstReg) // vdata 3427 .addUse(RSrc) // rsrc 3428 .addUse(VIndex) // vindex 3429 .addUse(VOffset) // voffset 3430 .addUse(SOffset) // soffset 3431 .addImm(ImmOffset); // offset(imm) 3432 3433 if (IsTyped) 3434 MIB.addImm(Format); 3435 3436 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3437 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3438 .addMemOperand(MMO); 3439 3440 if (LoadDstReg != Dst) { 3441 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3442 3443 // Widen result for extending loads was widened. 
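// For example (illustrative MIR only, made-up vreg names), a raw s16 load
// that is not a d16 format load comes back widened and is narrowed here:
//   %wide:_(s32) = G_AMDGPU_BUFFER_LOAD_USHORT ...
//   %dst:_(s16) = G_TRUNC %wide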
3444 if (IsExtLoad) 3445 B.buildTrunc(Dst, LoadDstReg); 3446 else { 3447 // Repack to original 16-bit vector result 3448 // FIXME: G_TRUNC should work, but legalization currently fails 3449 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3450 SmallVector<Register, 4> Repack; 3451 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3452 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3453 B.buildMerge(Dst, Repack); 3454 } 3455 } 3456 3457 MI.eraseFromParent(); 3458 return true; 3459 } 3460 3461 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3462 MachineIRBuilder &B, 3463 bool IsInc) const { 3464 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 3465 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3466 B.buildInstr(Opc) 3467 .addDef(MI.getOperand(0).getReg()) 3468 .addUse(MI.getOperand(2).getReg()) 3469 .addUse(MI.getOperand(3).getReg()) 3470 .cloneMemRefs(MI); 3471 MI.eraseFromParent(); 3472 return true; 3473 } 3474 3475 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3476 switch (IntrID) { 3477 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3478 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3479 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3480 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3481 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3482 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3483 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3484 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3485 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3486 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3487 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3488 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3489 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3490 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3491 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3492 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3493 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3494 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3495 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3496 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3497 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3498 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3499 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3500 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3501 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3502 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3503 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3504 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3505 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3506 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3507 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3508 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3509 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3510 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3511 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3512 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3513 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3514 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3515 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3516 default: 3517 llvm_unreachable("unhandled atomic opcode"); 3518 } 3519 } 3520 3521 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3522 MachineIRBuilder &B, 3523 Intrinsic::ID IID) const { 3524 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3525 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3526 3527 Register Dst = MI.getOperand(0).getReg(); 3528 Register VData = MI.getOperand(2).getReg(); 3529 3530 Register CmpVal; 
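// The cmpswap intrinsics carry an extra compare operand immediately after
// vdata, so every operand index that follows is shifted by OpOffset when
// IsCmpSwap is set.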
3531 int OpOffset = 0; 3532 3533 if (IsCmpSwap) { 3534 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3535 ++OpOffset; 3536 } 3537 3538 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3539 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 3540 3541 // The struct intrinsic variants add one additional operand over raw. 3542 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3543 Register VIndex; 3544 if (HasVIndex) { 3545 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3546 ++OpOffset; 3547 } 3548 3549 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3550 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3551 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3552 3553 MachineMemOperand *MMO = *MI.memoperands_begin(); 3554 3555 unsigned ImmOffset; 3556 unsigned TotalOffset; 3557 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3558 if (TotalOffset != 0) 3559 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3560 3561 if (!VIndex) 3562 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3563 3564 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 3565 .addDef(Dst) 3566 .addUse(VData); // vdata 3567 3568 if (IsCmpSwap) 3569 MIB.addReg(CmpVal); 3570 3571 MIB.addUse(RSrc) // rsrc 3572 .addUse(VIndex) // vindex 3573 .addUse(VOffset) // voffset 3574 .addUse(SOffset) // soffset 3575 .addImm(ImmOffset) // offset(imm) 3576 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3577 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3578 .addMemOperand(MMO); 3579 3580 MI.eraseFromParent(); 3581 return true; 3582 } 3583 3584 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3585 /// vector with s16 typed elements. 3586 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, 3587 SmallVectorImpl<Register> &PackedAddrs, 3588 int AddrIdx, int DimIdx, int EndIdx, 3589 int NumGradients) { 3590 const LLT S16 = LLT::scalar(16); 3591 const LLT V2S16 = LLT::vector(2, 16); 3592 3593 for (int I = AddrIdx; I < EndIdx; ++I) { 3594 MachineOperand &SrcOp = MI.getOperand(I); 3595 if (!SrcOp.isReg()) 3596 continue; // _L to _LZ may have eliminated this. 3597 3598 Register AddrReg = SrcOp.getReg(); 3599 3600 if (I < DimIdx) { 3601 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 3602 PackedAddrs.push_back(AddrReg); 3603 } else { 3604 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 3605 // derivatives dx/dh and dx/dv are packed with undef. 3606 if (((I + 1) >= EndIdx) || 3607 ((NumGradients / 2) % 2 == 1 && 3608 (I == DimIdx + (NumGradients / 2) - 1 || 3609 I == DimIdx + NumGradients - 1)) || 3610 // Check for _L to _LZ optimization 3611 !MI.getOperand(I + 1).isReg()) { 3612 PackedAddrs.push_back( 3613 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 3614 .getReg(0)); 3615 } else { 3616 PackedAddrs.push_back( 3617 B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) 3618 .getReg(0)); 3619 ++I; 3620 } 3621 } 3622 } 3623 } 3624 3625 /// Convert from separate vaddr components to a single vector address register, 3626 /// and replace the remaining operands with $noreg. 
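/// For example (illustrative MIR only, made-up vreg names), a simple 2D case
/// with s32 coordinates %x and %y has its first vaddr operand rewritten to
///   %vaddr:_(<2 x s32>) = G_BUILD_VECTOR %x, %y
/// and the remaining coordinate operand set to $noreg.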
3627 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 3628 int DimIdx, int NumVAddrs) { 3629 const LLT S32 = LLT::scalar(32); 3630 3631 SmallVector<Register, 8> AddrRegs; 3632 for (int I = 0; I != NumVAddrs; ++I) { 3633 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3634 if (SrcOp.isReg()) { 3635 AddrRegs.push_back(SrcOp.getReg()); 3636 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 3637 } 3638 } 3639 3640 int NumAddrRegs = AddrRegs.size(); 3641 if (NumAddrRegs != 1) { 3642 // Round up to 8 elements for v5-v7 3643 // FIXME: Missing intermediate sized register classes and instructions. 3644 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 3645 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 3646 auto Undef = B.buildUndef(S32); 3647 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 3648 NumAddrRegs = RoundedNumRegs; 3649 } 3650 3651 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 3652 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 3653 } 3654 3655 for (int I = 1; I != NumVAddrs; ++I) { 3656 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 3657 if (SrcOp.isReg()) 3658 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 3659 } 3660 } 3661 3662 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 3663 /// 3664 /// Depending on the subtarget, load/store with 16-bit element data need to be 3665 /// rewritten to use the low half of 32-bit registers, or directly use a packed 3666 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 3667 /// registers. 3668 /// 3669 /// We don't want to directly select image instructions just yet, but also want 3670 /// to exposes all register repacking to the legalizer/combiners. We also don't 3671 /// want a selected instrution entering RegBankSelect. In order to avoid 3672 /// defining a multitude of intermediate image instructions, directly hack on 3673 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 3674 /// now unnecessary arguments with $noreg. 3675 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 3676 MachineInstr &MI, MachineIRBuilder &B, 3677 GISelChangeObserver &Observer, 3678 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { 3679 3680 const int NumDefs = MI.getNumExplicitDefs(); 3681 bool IsTFE = NumDefs == 2; 3682 // We are only processing the operands of d16 image operations on subtargets 3683 // that use the unpacked register layout, or need to repack the TFE result. 3684 3685 // TODO: Do we need to guard against already legalized intrinsics? 3686 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 3687 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); 3688 3689 MachineRegisterInfo *MRI = B.getMRI(); 3690 const LLT S32 = LLT::scalar(32); 3691 const LLT S16 = LLT::scalar(16); 3692 const LLT V2S16 = LLT::vector(2, 16); 3693 3694 // Index of first address argument 3695 const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); 3696 3697 int NumVAddrs, NumGradients; 3698 std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); 3699 const int DMaskIdx = BaseOpcode->Atomic ? -1 : 3700 getDMaskIdx(BaseOpcode, NumDefs); 3701 unsigned DMask = 0; 3702 3703 // Check for 16 bit addresses and pack if true. 
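// DimIdx skips any extra arguments (e.g. bias/compare) and points at the
// first gradient operand, or at the first coordinate when there are no
// gradients; DimIdx + NumGradients is the first plain coordinate. Sampling
// the types of those two operands below is how the G16 (16-bit gradients)
// and A16 (16-bit addresses) forms are detected.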
3704 int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; 3705 LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); 3706 LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); 3707 const bool IsG16 = GradTy == S16; 3708 const bool IsA16 = AddrTy == S16; 3709 3710 int DMaskLanes = 0; 3711 if (!BaseOpcode->Atomic) { 3712 DMask = MI.getOperand(DMaskIdx).getImm(); 3713 if (BaseOpcode->Gather4) { 3714 DMaskLanes = 4; 3715 } else if (DMask != 0) { 3716 DMaskLanes = countPopulation(DMask); 3717 } else if (!IsTFE && !BaseOpcode->Store) { 3718 // If dmask is 0, this is a no-op load. This can be eliminated. 3719 B.buildUndef(MI.getOperand(0)); 3720 MI.eraseFromParent(); 3721 return true; 3722 } 3723 } 3724 3725 Observer.changingInstr(MI); 3726 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 3727 3728 unsigned NewOpcode = NumDefs == 0 ? 3729 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 3730 3731 // Track that we legalized this 3732 MI.setDesc(B.getTII().get(NewOpcode)); 3733 3734 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 3735 // dmask to be at least 1 otherwise the instruction will fail 3736 if (IsTFE && DMask == 0) { 3737 DMask = 0x1; 3738 DMaskLanes = 1; 3739 MI.getOperand(DMaskIdx).setImm(DMask); 3740 } 3741 3742 if (BaseOpcode->Atomic) { 3743 Register VData0 = MI.getOperand(2).getReg(); 3744 LLT Ty = MRI->getType(VData0); 3745 3746 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 3747 if (Ty.isVector()) 3748 return false; 3749 3750 if (BaseOpcode->AtomicX2) { 3751 Register VData1 = MI.getOperand(3).getReg(); 3752 // The two values are packed in one register. 3753 LLT PackedTy = LLT::vector(2, Ty); 3754 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 3755 MI.getOperand(2).setReg(Concat.getReg(0)); 3756 MI.getOperand(3).setReg(AMDGPU::NoRegister); 3757 } 3758 } 3759 3760 int CorrectedNumVAddrs = NumVAddrs; 3761 3762 // Optimize _L to _LZ when _L is zero 3763 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 3764 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { 3765 const ConstantFP *ConstantLod; 3766 const int LodIdx = AddrIdx + NumVAddrs - 1; 3767 3768 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { 3769 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 3770 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 3771 ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( 3772 LZMappingInfo->LZ, ImageDimIntr->Dim); 3773 3774 // The starting indexes should remain in the same place. 3775 --NumVAddrs; 3776 --CorrectedNumVAddrs; 3777 3778 MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( 3779 static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); 3780 MI.RemoveOperand(LodIdx); 3781 } 3782 } 3783 } 3784 3785 // Optimize _mip away, when 'lod' is zero 3786 if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { 3787 int64_t ConstantLod; 3788 const int LodIdx = AddrIdx + NumVAddrs - 1; 3789 3790 if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { 3791 if (ConstantLod == 0) { 3792 // TODO: Change intrinsic opcode and remove operand instead or replacing 3793 // it with 0, as the _L to _LZ handling is done above. 3794 MI.getOperand(LodIdx).ChangeToImmediate(0); 3795 --CorrectedNumVAddrs; 3796 } 3797 } 3798 } 3799 3800 // Rewrite the addressing register layout before doing anything else. 
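// For example (illustrative MIR only, made-up vreg names), a simple 2D a16
// sample with s16 coordinates %s and %t has both coordinates folded into one
// operand:
//   %st:_(<2 x s16>) = G_BUILD_VECTOR %s, %t
// with the now-unused trailing address operand padded out with $noreg.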
3801 if (IsA16 || IsG16) { 3802 if (IsA16) { 3803 // Target must support the feature and gradients need to be 16 bit too 3804 if (!ST.hasA16() || !IsG16) 3805 return false; 3806 } else if (!ST.hasG16()) 3807 return false; 3808 3809 if (NumVAddrs > 1) { 3810 SmallVector<Register, 4> PackedRegs; 3811 // Don't compress addresses for G16 3812 const int PackEndIdx = 3813 IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); 3814 packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, 3815 PackEndIdx, NumGradients); 3816 3817 if (!IsA16) { 3818 // Add uncompressed address 3819 for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { 3820 int AddrReg = MI.getOperand(I).getReg(); 3821 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 3822 PackedRegs.push_back(AddrReg); 3823 } 3824 } 3825 3826 // See also below in the non-a16 branch 3827 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 3828 3829 if (!UseNSA && PackedRegs.size() > 1) { 3830 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 3831 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 3832 PackedRegs[0] = Concat.getReg(0); 3833 PackedRegs.resize(1); 3834 } 3835 3836 const int NumPacked = PackedRegs.size(); 3837 for (int I = 0; I != NumVAddrs; ++I) { 3838 MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); 3839 if (!SrcOp.isReg()) { 3840 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 3841 continue; 3842 } 3843 3844 assert(SrcOp.getReg() != AMDGPU::NoRegister); 3845 3846 if (I < NumPacked) 3847 SrcOp.setReg(PackedRegs[I]); 3848 else 3849 SrcOp.setReg(AMDGPU::NoRegister); 3850 } 3851 } 3852 } else { 3853 // If the register allocator cannot place the address registers contiguously 3854 // without introducing moves, then using the non-sequential address encoding 3855 // is always preferable, since it saves VALU instructions and is usually a 3856 // wash in terms of code size or even better. 3857 // 3858 // However, we currently have no way of hinting to the register allocator 3859 // that MIMG addresses should be placed contiguously when it is possible to 3860 // do so, so force non-NSA for the common 2-address case as a heuristic. 3861 // 3862 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 3863 // allocation when possible. 3864 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 3865 3866 if (!UseNSA && NumVAddrs > 1) 3867 convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); 3868 } 3869 3870 int Flags = 0; 3871 if (IsA16) 3872 Flags |= 1; 3873 if (IsG16) 3874 Flags |= 2; 3875 MI.addOperand(MachineOperand::CreateImm(Flags)); 3876 3877 if (BaseOpcode->Store) { // No TFE for stores? 3878 // TODO: Handle dmask trim 3879 Register VData = MI.getOperand(1).getReg(); 3880 LLT Ty = MRI->getType(VData); 3881 if (!Ty.isVector() || Ty.getElementType() != S16) 3882 return true; 3883 3884 Register RepackedReg = handleD16VData(B, *MRI, VData); 3885 if (RepackedReg != VData) { 3886 MI.getOperand(1).setReg(RepackedReg); 3887 } 3888 3889 return true; 3890 } 3891 3892 Register DstReg = MI.getOperand(0).getReg(); 3893 LLT Ty = MRI->getType(DstReg); 3894 const LLT EltTy = Ty.getScalarType(); 3895 const bool IsD16 = Ty.getScalarType() == S16; 3896 const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1; 3897 3898 // Confirm that the return type is large enough for the dmask specified 3899 if (NumElts < DMaskLanes) 3900 return false; 3901 3902 if (NumElts > 4 || DMaskLanes > 4) 3903 return false; 3904 3905 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 3906 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 3907 3908 // The raw dword aligned data component of the load. The only legal cases 3909 // where this matters should be when using the packed D16 format, for 3910 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 3911 LLT RoundedTy; 3912 3913 // S32 vector to to cover all data, plus TFE result element. 3914 LLT TFETy; 3915 3916 // Register type to use for each loaded component. Will be S32 or V2S16. 3917 LLT RegTy; 3918 3919 if (IsD16 && ST.hasUnpackedD16VMem()) { 3920 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 3921 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 3922 RegTy = S32; 3923 } else { 3924 unsigned EltSize = EltTy.getSizeInBits(); 3925 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 3926 unsigned RoundedSize = 32 * RoundedElts; 3927 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 3928 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 3929 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 3930 } 3931 3932 // The return type does not need adjustment. 3933 // TODO: Should we change s16 case to s32 or <2 x s16>? 3934 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 3935 return true; 3936 3937 Register Dst1Reg; 3938 3939 // Insert after the instruction. 3940 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 3941 3942 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 3943 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 3944 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 3945 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 3946 3947 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 3948 3949 MI.getOperand(0).setReg(NewResultReg); 3950 3951 // In the IR, TFE is supposed to be used with a 2 element struct return 3952 // type. The intruction really returns these two values in one contiguous 3953 // register, with one additional dword beyond the loaded data. Rewrite the 3954 // return type to use a single register result. 3955 3956 if (IsTFE) { 3957 Dst1Reg = MI.getOperand(1).getReg(); 3958 if (MRI->getType(Dst1Reg) != S32) 3959 return false; 3960 3961 // TODO: Make sure the TFE operand bit is set. 3962 MI.RemoveOperand(1); 3963 3964 // Handle the easy case that requires no repack instructions. 3965 if (Ty == S32) { 3966 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 3967 return true; 3968 } 3969 } 3970 3971 // Now figure out how to copy the new result register back into the old 3972 // result. 3973 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 3974 3975 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 3976 3977 if (ResultNumRegs == 1) { 3978 assert(!IsTFE); 3979 ResultRegs[0] = NewResultReg; 3980 } else { 3981 // We have to repack into a new vector of some kind. 3982 for (int I = 0; I != NumDataRegs; ++I) 3983 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 3984 B.buildUnmerge(ResultRegs, NewResultReg); 3985 3986 // Drop the final TFE element to get the data part. The TFE result is 3987 // directly written to the right place already. 
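// For example (illustrative MIR only, made-up vreg names), a TFE load whose
// data part needs two dwords is unmerged as
//   %d0:_(s32), %d1:_(s32), %tfe:_(s32) = G_UNMERGE_VALUES %NewResultReg
// where %tfe is unmerged directly into the original error-flag def, so only
// the data registers need further repacking.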
3988 if (IsTFE) 3989 ResultRegs.resize(NumDataRegs); 3990 } 3991 3992 // For an s16 scalar result, we form an s32 result with a truncate regardless 3993 // of packed vs. unpacked. 3994 if (IsD16 && !Ty.isVector()) { 3995 B.buildTrunc(DstReg, ResultRegs[0]); 3996 return true; 3997 } 3998 3999 // Avoid a build/concat_vector of 1 entry. 4000 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4001 B.buildBitcast(DstReg, ResultRegs[0]); 4002 return true; 4003 } 4004 4005 assert(Ty.isVector()); 4006 4007 if (IsD16) { 4008 // For packed D16 results with TFE enabled, all the data components are 4009 // S32. Cast back to the expected type. 4010 // 4011 // TODO: We don't really need to use load s32 elements. We would only need one 4012 // cast for the TFE result if a multiple of v2s16 was used. 4013 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4014 for (Register &Reg : ResultRegs) 4015 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4016 } else if (ST.hasUnpackedD16VMem()) { 4017 for (Register &Reg : ResultRegs) 4018 Reg = B.buildTrunc(S16, Reg).getReg(0); 4019 } 4020 } 4021 4022 auto padWithUndef = [&](LLT Ty, int NumElts) { 4023 if (NumElts == 0) 4024 return; 4025 Register Undef = B.buildUndef(Ty).getReg(0); 4026 for (int I = 0; I != NumElts; ++I) 4027 ResultRegs.push_back(Undef); 4028 }; 4029 4030 // Pad out any elements eliminated due to the dmask. 4031 LLT ResTy = MRI->getType(ResultRegs[0]); 4032 if (!ResTy.isVector()) { 4033 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4034 B.buildBuildVector(DstReg, ResultRegs); 4035 return true; 4036 } 4037 4038 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4039 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4040 4041 // Deal with the one annoying legal case. 4042 const LLT V3S16 = LLT::vector(3, 16); 4043 if (Ty == V3S16) { 4044 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4045 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4046 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4047 return true; 4048 } 4049 4050 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4051 B.buildConcatVectors(DstReg, ResultRegs); 4052 return true; 4053 } 4054 4055 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4056 MachineInstr &MI, MachineIRBuilder &B, 4057 GISelChangeObserver &Observer) const { 4058 Register Dst = MI.getOperand(0).getReg(); 4059 LLT Ty = B.getMRI()->getType(Dst); 4060 unsigned Size = Ty.getSizeInBits(); 4061 MachineFunction &MF = B.getMF(); 4062 4063 Observer.changingInstr(MI); 4064 4065 // FIXME: We don't really need this intermediate instruction. The intrinsic 4066 // should be fixed to have a memory operand. Since it's readnone, we're not 4067 // allowed to add one. 4068 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4069 MI.RemoveOperand(1); // Remove intrinsic ID 4070 4071 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4072 // TODO: Should this use datalayout alignment? 4073 const unsigned MemSize = (Size + 7) / 8; 4074 const Align MemAlign(4); 4075 MachineMemOperand *MMO = MF.getMachineMemOperand( 4076 MachinePointerInfo(), 4077 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4078 MachineMemOperand::MOInvariant, 4079 MemSize, MemAlign); 4080 MI.addMemOperand(MF, MMO); 4081 4082 // There are no 96-bit result scalar loads, but widening to 128-bit should 4083 // always be legal. 
We may need to restore this to a 96-bit result if it turns 4084 // out this needs to be converted to a vector load during RegBankSelect. 4085 if (!isPowerOf2_32(Size)) { 4086 LegalizerHelper Helper(MF, *this, Observer, B); 4087 4088 if (Ty.isVector()) 4089 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4090 else 4091 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4092 } 4093 4094 Observer.changedInstr(MI); 4095 return true; 4096 } 4097 4098 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4099 MachineRegisterInfo &MRI, 4100 MachineIRBuilder &B) const { 4101 // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm. 4102 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4103 !ST.isTrapHandlerEnabled()) { 4104 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4105 } else { 4106 // Pass queue pointer to trap handler as input, and insert trap instruction 4107 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4108 const ArgDescriptor *Arg = 4109 getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); 4110 if (!Arg) 4111 return false; 4112 MachineRegisterInfo &MRI = *B.getMRI(); 4113 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4114 Register LiveIn = getLiveInRegister( 4115 B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), 4116 /*InsertLiveInCopy=*/false); 4117 if (!loadInputValue(LiveIn, B, Arg)) 4118 return false; 4119 B.buildCopy(SGPR01, LiveIn); 4120 B.buildInstr(AMDGPU::S_TRAP) 4121 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4122 .addReg(SGPR01, RegState::Implicit); 4123 } 4124 4125 MI.eraseFromParent(); 4126 return true; 4127 } 4128 4129 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4130 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4131 // If this is a non-HSA path or the trap handler is disabled, report a warning 4132 // accordingly. 4133 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4134 !ST.isTrapHandlerEnabled()) { 4135 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4136 "debugtrap handler not supported", 4137 MI.getDebugLoc(), DS_Warning); 4138 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4139 Ctx.diagnose(NoTrap); 4140 } else { 4141 // Insert debug-trap instruction 4142 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4143 } 4144 4145 MI.eraseFromParent(); 4146 return true; 4147 } 4148 4149 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4150 MachineInstr &MI) const { 4151 MachineIRBuilder &B = Helper.MIRBuilder; 4152 MachineRegisterInfo &MRI = *B.getMRI(); 4153 4154 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
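// In rough terms, for amdgcn_if the i1 result that fed the G_BRCOND
// disappears, the exec-mask result becomes the def of SI_IF, SI_IF branches
// to the block the original unconditional branch targeted, and the
// conditional target becomes the fall-through/explicit G_BR destination.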
4155 auto IntrID = MI.getIntrinsicID(); 4156 switch (IntrID) { 4157 case Intrinsic::amdgcn_if: 4158 case Intrinsic::amdgcn_else: { 4159 MachineInstr *Br = nullptr; 4160 MachineBasicBlock *UncondBrTarget = nullptr; 4161 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4162 const SIRegisterInfo *TRI 4163 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4164 4165 Register Def = MI.getOperand(1).getReg(); 4166 Register Use = MI.getOperand(3).getReg(); 4167 4168 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4169 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4170 if (IntrID == Intrinsic::amdgcn_if) { 4171 B.buildInstr(AMDGPU::SI_IF) 4172 .addDef(Def) 4173 .addUse(Use) 4174 .addMBB(UncondBrTarget); 4175 } else { 4176 B.buildInstr(AMDGPU::SI_ELSE) 4177 .addDef(Def) 4178 .addUse(Use) 4179 .addMBB(UncondBrTarget) 4180 .addImm(0); 4181 } 4182 4183 if (Br) { 4184 Br->getOperand(0).setMBB(CondBrTarget); 4185 } else { 4186 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4187 // since we're swapping branch targets it needs to be reinserted. 4188 // FIXME: IRTranslator should probably not do this 4189 B.buildBr(*CondBrTarget); 4190 } 4191 4192 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4193 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4194 MI.eraseFromParent(); 4195 BrCond->eraseFromParent(); 4196 return true; 4197 } 4198 4199 return false; 4200 } 4201 case Intrinsic::amdgcn_loop: { 4202 MachineInstr *Br = nullptr; 4203 MachineBasicBlock *UncondBrTarget = nullptr; 4204 if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { 4205 const SIRegisterInfo *TRI 4206 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4207 4208 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4209 Register Reg = MI.getOperand(2).getReg(); 4210 4211 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4212 B.buildInstr(AMDGPU::SI_LOOP) 4213 .addUse(Reg) 4214 .addMBB(UncondBrTarget); 4215 4216 if (Br) 4217 Br->getOperand(0).setMBB(CondBrTarget); 4218 else 4219 B.buildBr(*CondBrTarget); 4220 4221 MI.eraseFromParent(); 4222 BrCond->eraseFromParent(); 4223 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4224 return true; 4225 } 4226 4227 return false; 4228 } 4229 case Intrinsic::amdgcn_kernarg_segment_ptr: 4230 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4231 // This only makes sense to call in a kernel, so just lower to null. 
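// Outside of a kernel entry point there is no meaningful kernarg segment to
// point at, so the result is simply folded to a zero (null) pointer value.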
4232 B.buildConstant(MI.getOperand(0).getReg(), 0); 4233 MI.eraseFromParent(); 4234 return true; 4235 } 4236 4237 return legalizePreloadedArgIntrin( 4238 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4239 case Intrinsic::amdgcn_implicitarg_ptr: 4240 return legalizeImplicitArgPtr(MI, MRI, B); 4241 case Intrinsic::amdgcn_workitem_id_x: 4242 return legalizePreloadedArgIntrin(MI, MRI, B, 4243 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4244 case Intrinsic::amdgcn_workitem_id_y: 4245 return legalizePreloadedArgIntrin(MI, MRI, B, 4246 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4247 case Intrinsic::amdgcn_workitem_id_z: 4248 return legalizePreloadedArgIntrin(MI, MRI, B, 4249 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4250 case Intrinsic::amdgcn_workgroup_id_x: 4251 return legalizePreloadedArgIntrin(MI, MRI, B, 4252 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4253 case Intrinsic::amdgcn_workgroup_id_y: 4254 return legalizePreloadedArgIntrin(MI, MRI, B, 4255 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4256 case Intrinsic::amdgcn_workgroup_id_z: 4257 return legalizePreloadedArgIntrin(MI, MRI, B, 4258 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4259 case Intrinsic::amdgcn_dispatch_ptr: 4260 return legalizePreloadedArgIntrin(MI, MRI, B, 4261 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4262 case Intrinsic::amdgcn_queue_ptr: 4263 return legalizePreloadedArgIntrin(MI, MRI, B, 4264 AMDGPUFunctionArgInfo::QUEUE_PTR); 4265 case Intrinsic::amdgcn_implicit_buffer_ptr: 4266 return legalizePreloadedArgIntrin( 4267 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4268 case Intrinsic::amdgcn_dispatch_id: 4269 return legalizePreloadedArgIntrin(MI, MRI, B, 4270 AMDGPUFunctionArgInfo::DISPATCH_ID); 4271 case Intrinsic::amdgcn_fdiv_fast: 4272 return legalizeFDIVFastIntrin(MI, MRI, B); 4273 case Intrinsic::amdgcn_is_shared: 4274 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4275 case Intrinsic::amdgcn_is_private: 4276 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4277 case Intrinsic::amdgcn_wavefrontsize: { 4278 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4279 MI.eraseFromParent(); 4280 return true; 4281 } 4282 case Intrinsic::amdgcn_s_buffer_load: 4283 return legalizeSBufferLoad(MI, B, Helper.Observer); 4284 case Intrinsic::amdgcn_raw_buffer_store: 4285 case Intrinsic::amdgcn_struct_buffer_store: 4286 return legalizeBufferStore(MI, MRI, B, false, false); 4287 case Intrinsic::amdgcn_raw_buffer_store_format: 4288 case Intrinsic::amdgcn_struct_buffer_store_format: 4289 return legalizeBufferStore(MI, MRI, B, false, true); 4290 case Intrinsic::amdgcn_raw_tbuffer_store: 4291 case Intrinsic::amdgcn_struct_tbuffer_store: 4292 return legalizeBufferStore(MI, MRI, B, true, true); 4293 case Intrinsic::amdgcn_raw_buffer_load: 4294 case Intrinsic::amdgcn_struct_buffer_load: 4295 return legalizeBufferLoad(MI, MRI, B, false, false); 4296 case Intrinsic::amdgcn_raw_buffer_load_format: 4297 case Intrinsic::amdgcn_struct_buffer_load_format: 4298 return legalizeBufferLoad(MI, MRI, B, true, false); 4299 case Intrinsic::amdgcn_raw_tbuffer_load: 4300 case Intrinsic::amdgcn_struct_tbuffer_load: 4301 return legalizeBufferLoad(MI, MRI, B, true, true); 4302 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4303 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4304 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4305 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4306 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4307 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4308 case 
Intrinsic::amdgcn_raw_buffer_atomic_smin: 4309 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4310 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4311 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4312 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4313 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4314 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4315 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4316 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4317 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4318 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4319 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4320 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4321 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4322 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4323 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4324 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4325 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4326 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4327 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4328 return legalizeBufferAtomic(MI, B, IntrID); 4329 case Intrinsic::amdgcn_atomic_inc: 4330 return legalizeAtomicIncDec(MI, B, true); 4331 case Intrinsic::amdgcn_atomic_dec: 4332 return legalizeAtomicIncDec(MI, B, false); 4333 case Intrinsic::trap: 4334 return legalizeTrapIntrinsic(MI, MRI, B); 4335 case Intrinsic::debugtrap: 4336 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4337 default: { 4338 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4339 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4340 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 4341 return true; 4342 } 4343 } 4344 4345 return true; 4346 } 4347