//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false),
    cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the scalar size up to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 73 return [=](const LegalityQuery &Query) { 74 const LLT Ty = Query.Types[TypeIdx]; 75 if (!Ty.isVector()) 76 return false; 77 78 const LLT EltTy = Ty.getElementType(); 79 const unsigned EltSize = EltTy.getSizeInBits(); 80 return Ty.getNumElements() % 2 != 0 && 81 EltSize > 1 && EltSize < 32 && 82 Ty.getSizeInBits() % 32 != 0; 83 }; 84 } 85 86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 87 return [=](const LegalityQuery &Query) { 88 const LLT Ty = Query.Types[TypeIdx]; 89 return Ty.getSizeInBits() % 32 == 0; 90 }; 91 } 92 93 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 94 return [=](const LegalityQuery &Query) { 95 const LLT Ty = Query.Types[TypeIdx]; 96 const LLT EltTy = Ty.getScalarType(); 97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 98 }; 99 } 100 101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 102 return [=](const LegalityQuery &Query) { 103 const LLT Ty = Query.Types[TypeIdx]; 104 const LLT EltTy = Ty.getElementType(); 105 return std::pair(TypeIdx, 106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 107 }; 108 } 109 110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 111 return [=](const LegalityQuery &Query) { 112 const LLT Ty = Query.Types[TypeIdx]; 113 const LLT EltTy = Ty.getElementType(); 114 unsigned Size = Ty.getSizeInBits(); 115 unsigned Pieces = (Size + 63) / 64; 116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 117 return std::pair(TypeIdx, LLT::scalarOrVector( 118 ElementCount::getFixed(NewNumElts), EltTy)); 119 }; 120 } 121 122 // Increase the number of vector elements to reach the next multiple of 32-bit 123 // type. 124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 125 return [=](const LegalityQuery &Query) { 126 const LLT Ty = Query.Types[TypeIdx]; 127 128 const LLT EltTy = Ty.getElementType(); 129 const int Size = Ty.getSizeInBits(); 130 const int EltSize = EltTy.getSizeInBits(); 131 const int NextMul32 = (Size + 31) / 32; 132 133 assert(EltSize < 32); 134 135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 137 }; 138 } 139 140 // Increase the number of vector elements to reach the next legal RegClass. 141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { 142 return [=](const LegalityQuery &Query) { 143 const LLT Ty = Query.Types[TypeIdx]; 144 const unsigned NumElts = Ty.getNumElements(); 145 const unsigned EltSize = Ty.getElementType().getSizeInBits(); 146 const unsigned MaxNumElts = MaxRegisterSize / EltSize; 147 148 assert(EltSize == 32 || EltSize == 64); 149 assert(Ty.getSizeInBits() < MaxRegisterSize); 150 151 unsigned NewNumElts; 152 // Find the nearest legal RegClass that is larger than the current type. 
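    // (The scan advances one element at a time; with 32- or 64-bit elements
    // that is at most MaxRegisterSize / EltSize steps, and if no matching
    // class is found the type simply grows to MaxNumElts.)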
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) { 154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize)) 155 break; 156 } 157 158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize)); 159 }; 160 } 161 162 static LLT getBufferRsrcScalarType(const LLT Ty) { 163 if (!Ty.isVector()) 164 return LLT::scalar(128); 165 const ElementCount NumElems = Ty.getElementCount(); 166 return LLT::vector(NumElems, LLT::scalar(128)); 167 } 168 169 static LLT getBufferRsrcRegisterType(const LLT Ty) { 170 if (!Ty.isVector()) 171 return LLT::fixed_vector(4, LLT::scalar(32)); 172 const unsigned NumElems = Ty.getElementCount().getFixedValue(); 173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32)); 174 } 175 176 static LLT getBitcastRegisterType(const LLT Ty) { 177 const unsigned Size = Ty.getSizeInBits(); 178 179 if (Size <= 32) { 180 // <2 x s8> -> s16 181 // <4 x s8> -> s32 182 return LLT::scalar(Size); 183 } 184 185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 186 } 187 188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 189 return [=](const LegalityQuery &Query) { 190 const LLT Ty = Query.Types[TypeIdx]; 191 return std::pair(TypeIdx, getBitcastRegisterType(Ty)); 192 }; 193 } 194 195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 196 return [=](const LegalityQuery &Query) { 197 const LLT Ty = Query.Types[TypeIdx]; 198 unsigned Size = Ty.getSizeInBits(); 199 assert(Size % 32 == 0); 200 return std::pair( 201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 202 }; 203 } 204 205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 206 return [=](const LegalityQuery &Query) { 207 const LLT QueryTy = Query.Types[TypeIdx]; 208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 209 }; 210 } 211 212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 213 return [=](const LegalityQuery &Query) { 214 const LLT QueryTy = Query.Types[TypeIdx]; 215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 216 }; 217 } 218 219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 220 return [=](const LegalityQuery &Query) { 221 const LLT QueryTy = Query.Types[TypeIdx]; 222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 223 }; 224 } 225 226 static bool isRegisterSize(unsigned Size) { 227 return Size % 32 == 0 && Size <= MaxRegisterSize; 228 } 229 230 static bool isRegisterVectorElementType(LLT EltTy) { 231 const int EltSize = EltTy.getSizeInBits(); 232 return EltSize == 16 || EltSize % 32 == 0; 233 } 234 235 static bool isRegisterVectorType(LLT Ty) { 236 const int EltSize = Ty.getElementType().getSizeInBits(); 237 return EltSize == 32 || EltSize == 64 || 238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 239 EltSize == 128 || EltSize == 256; 240 } 241 242 static bool isRegisterType(LLT Ty) { 243 if (!isRegisterSize(Ty.getSizeInBits())) 244 return false; 245 246 if (Ty.isVector()) 247 return isRegisterVectorType(Ty); 248 249 return true; 250 } 251 252 // Any combination of 32 or 64-bit elements up the maximum register size, and 253 // multiples of v2s16. 254 static LegalityPredicate isRegisterType(unsigned TypeIdx) { 255 return [=](const LegalityQuery &Query) { 256 return isRegisterType(Query.Types[TypeIdx]); 257 }; 258 } 259 260 // RegisterType that doesn't have a corresponding RegClass. 
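// Used together with moreElementsToNextExistingRegClass above, so that
// register-sized vectors with no matching SGPR class get padded out to the
// next width that does have one.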
261 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { 262 return [=](const LegalityQuery &Query) { 263 LLT Ty = Query.Types[TypeIdx]; 264 return isRegisterType(Ty) && 265 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits()); 266 }; 267 } 268 269 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 270 return [=](const LegalityQuery &Query) { 271 const LLT QueryTy = Query.Types[TypeIdx]; 272 if (!QueryTy.isVector()) 273 return false; 274 const LLT EltTy = QueryTy.getElementType(); 275 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 276 }; 277 } 278 279 // If we have a truncating store or an extending load with a data size larger 280 // than 32-bits, we need to reduce to a 32-bit type. 281 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 282 return [=](const LegalityQuery &Query) { 283 const LLT Ty = Query.Types[TypeIdx]; 284 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 285 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 286 }; 287 } 288 289 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 290 // handle some operations by just promoting the register during 291 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 292 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 293 bool IsLoad, bool IsAtomic) { 294 switch (AS) { 295 case AMDGPUAS::PRIVATE_ADDRESS: 296 // FIXME: Private element size. 297 return ST.enableFlatScratch() ? 128 : 32; 298 case AMDGPUAS::LOCAL_ADDRESS: 299 return ST.useDS128() ? 128 : 64; 300 case AMDGPUAS::GLOBAL_ADDRESS: 301 case AMDGPUAS::CONSTANT_ADDRESS: 302 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 303 case AMDGPUAS::BUFFER_RESOURCE: 304 // Treat constant and global as identical. SMRD loads are sometimes usable for 305 // global loads (ideally constant address space should be eliminated) 306 // depending on the context. Legality cannot be context dependent, but 307 // RegBankSelect can split the load as necessary depending on the pointer 308 // register bank/uniformity and if the memory is invariant or not written in a 309 // kernel. 310 return IsLoad ? 512 : 128; 311 default: 312 // FIXME: Flat addresses may contextually need to be split to 32-bit parts 313 // if they may alias scratch depending on the subtarget. This needs to be 314 // moved to custom handling to use addressMayBeAccessedAsPrivate 315 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32; 316 } 317 } 318 319 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 320 const LegalityQuery &Query) { 321 const LLT Ty = Query.Types[0]; 322 323 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 324 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 325 326 unsigned RegSize = Ty.getSizeInBits(); 327 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 328 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 329 unsigned AS = Query.Types[1].getAddressSpace(); 330 331 // All of these need to be custom lowered to cast the pointer operand. 332 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 333 return false; 334 335 // Do not handle extending vector loads. 336 if (Ty.isVector() && MemSize != RegSize) 337 return false; 338 339 // TODO: We should be able to widen loads if the alignment is high enough, but 340 // we also need to modify the memory access size. 341 #if 0 342 // Accept widening loads based on alignment. 
343 if (IsLoad && MemSize < Size) 344 MemSize = std::max(MemSize, Align); 345 #endif 346 347 // Only 1-byte and 2-byte to 32-bit extloads are valid. 348 if (MemSize != RegSize && RegSize != 32) 349 return false; 350 351 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 352 Query.MMODescrs[0].Ordering != 353 AtomicOrdering::NotAtomic)) 354 return false; 355 356 switch (MemSize) { 357 case 8: 358 case 16: 359 case 32: 360 case 64: 361 case 128: 362 break; 363 case 96: 364 if (!ST.hasDwordx3LoadStores()) 365 return false; 366 break; 367 case 256: 368 case 512: 369 // These may contextually need to be broken down. 370 break; 371 default: 372 return false; 373 } 374 375 assert(RegSize >= MemSize); 376 377 if (AlignBits < MemSize) { 378 const SITargetLowering *TLI = ST.getTargetLowering(); 379 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 380 Align(AlignBits / 8))) 381 return false; 382 } 383 384 return true; 385 } 386 387 // The newer buffer intrinsic forms take their resource arguments as 388 // pointers in address space 8, aka s128 values. However, in order to not break 389 // SelectionDAG, the underlying operations have to continue to take v4i32 390 // arguments. Therefore, we convert resource pointers - or vectors of them 391 // to integer values here. 392 static bool hasBufferRsrcWorkaround(const LLT Ty) { 393 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE) 394 return true; 395 if (Ty.isVector()) { 396 const LLT ElemTy = Ty.getElementType(); 397 return hasBufferRsrcWorkaround(ElemTy); 398 } 399 return false; 400 } 401 402 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 403 // workaround this. Eventually it should ignore the type for loads and only care 404 // about the size. Return true in cases where we will workaround this for now by 405 // bitcasting. 406 static bool loadStoreBitcastWorkaround(const LLT Ty) { 407 if (EnableNewLegality) 408 return false; 409 410 const unsigned Size = Ty.getSizeInBits(); 411 if (Size <= 64) 412 return false; 413 // Address space 8 pointers get their own workaround. 414 if (hasBufferRsrcWorkaround(Ty)) 415 return false; 416 if (!Ty.isVector()) 417 return true; 418 419 LLT EltTy = Ty.getElementType(); 420 if (EltTy.isPointer()) 421 return true; 422 423 unsigned EltSize = EltTy.getSizeInBits(); 424 return EltSize != 32 && EltSize != 64; 425 } 426 427 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 428 const LLT Ty = Query.Types[0]; 429 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 430 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty); 431 } 432 433 /// Return true if a load or store of the type should be lowered with a bitcast 434 /// to a different type. 435 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 436 const LLT MemTy) { 437 const unsigned MemSizeInBits = MemTy.getSizeInBits(); 438 const unsigned Size = Ty.getSizeInBits(); 439 if (Size != MemSizeInBits) 440 return Size <= 32 && Ty.isVector(); 441 442 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 443 return true; 444 445 // Don't try to handle bitcasting vector ext loads for now. 446 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 447 (Size <= 32 || isRegisterSize(Size)) && 448 !isRegisterVectorElementType(Ty.getElementType()); 449 } 450 451 /// Return true if we should legalize a load by widening an odd sized memory 452 /// access up to the alignment. 
Note that in this case the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}

/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form the value must be in so it can be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics.
This is a hack, which is 540 /// needed in order to account for the fact that we can't define a register 541 /// class for s128 without breaking SelectionDAG. 542 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) { 543 MachineRegisterInfo &MRI = *B.getMRI(); 544 const LLT PointerTy = MRI.getType(Pointer); 545 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); 546 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); 547 548 if (!PointerTy.isVector()) { 549 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32) 550 SmallVector<Register, 4> PointerParts; 551 const unsigned NumParts = PointerTy.getSizeInBits() / 32; 552 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer); 553 for (unsigned I = 0; I < NumParts; ++I) 554 PointerParts.push_back(Unmerged.getReg(I)); 555 return B.buildBuildVector(VectorTy, PointerParts).getReg(0); 556 } 557 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0); 558 return B.buildBitcast(VectorTy, Scalar).getReg(0); 559 } 560 561 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, 562 unsigned Idx) { 563 MachineOperand &MO = MI.getOperand(Idx); 564 565 const LLT PointerTy = B.getMRI()->getType(MO.getReg()); 566 // Paranoidly prevent us from doing this multiple times. 567 if (!hasBufferRsrcWorkaround(PointerTy)) 568 return; 569 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B)); 570 } 571 572 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 573 const GCNTargetMachine &TM) 574 : ST(ST_) { 575 using namespace TargetOpcode; 576 577 auto GetAddrSpacePtr = [&TM](unsigned AS) { 578 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 579 }; 580 581 const LLT S1 = LLT::scalar(1); 582 const LLT S8 = LLT::scalar(8); 583 const LLT S16 = LLT::scalar(16); 584 const LLT S32 = LLT::scalar(32); 585 const LLT S64 = LLT::scalar(64); 586 const LLT S128 = LLT::scalar(128); 587 const LLT S256 = LLT::scalar(256); 588 const LLT S512 = LLT::scalar(512); 589 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 590 591 const LLT V2S8 = LLT::fixed_vector(2, 8); 592 const LLT V2S16 = LLT::fixed_vector(2, 16); 593 const LLT V4S16 = LLT::fixed_vector(4, 16); 594 595 const LLT V2S32 = LLT::fixed_vector(2, 32); 596 const LLT V3S32 = LLT::fixed_vector(3, 32); 597 const LLT V4S32 = LLT::fixed_vector(4, 32); 598 const LLT V5S32 = LLT::fixed_vector(5, 32); 599 const LLT V6S32 = LLT::fixed_vector(6, 32); 600 const LLT V7S32 = LLT::fixed_vector(7, 32); 601 const LLT V8S32 = LLT::fixed_vector(8, 32); 602 const LLT V9S32 = LLT::fixed_vector(9, 32); 603 const LLT V10S32 = LLT::fixed_vector(10, 32); 604 const LLT V11S32 = LLT::fixed_vector(11, 32); 605 const LLT V12S32 = LLT::fixed_vector(12, 32); 606 const LLT V13S32 = LLT::fixed_vector(13, 32); 607 const LLT V14S32 = LLT::fixed_vector(14, 32); 608 const LLT V15S32 = LLT::fixed_vector(15, 32); 609 const LLT V16S32 = LLT::fixed_vector(16, 32); 610 const LLT V32S32 = LLT::fixed_vector(32, 32); 611 612 const LLT V2S64 = LLT::fixed_vector(2, 64); 613 const LLT V3S64 = LLT::fixed_vector(3, 64); 614 const LLT V4S64 = LLT::fixed_vector(4, 64); 615 const LLT V5S64 = LLT::fixed_vector(5, 64); 616 const LLT V6S64 = LLT::fixed_vector(6, 64); 617 const LLT V7S64 = LLT::fixed_vector(7, 64); 618 const LLT V8S64 = LLT::fixed_vector(8, 64); 619 const LLT V16S64 = LLT::fixed_vector(16, 64); 620 621 std::initializer_list<LLT> AllS32Vectors = 622 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 623 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 624 
std::initializer_list<LLT> AllS64Vectors = 625 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 626 627 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 628 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 629 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 630 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 631 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 632 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 633 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 634 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER); 635 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE); 636 const LLT BufferStridedPtr = 637 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER); 638 639 const LLT CodePtr = FlatPtr; 640 641 const std::initializer_list<LLT> AddrSpaces64 = { 642 GlobalPtr, ConstantPtr, FlatPtr 643 }; 644 645 const std::initializer_list<LLT> AddrSpaces32 = { 646 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 647 }; 648 649 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr}; 650 651 const std::initializer_list<LLT> FPTypesBase = { 652 S32, S64 653 }; 654 655 const std::initializer_list<LLT> FPTypes16 = { 656 S32, S64, S16 657 }; 658 659 const std::initializer_list<LLT> FPTypesPK16 = { 660 S32, S64, S16, V2S16 661 }; 662 663 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 664 665 // s1 for VCC branches, s32 for SCC branches. 666 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 667 668 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 669 // elements for v3s16 670 getActionDefinitionsBuilder(G_PHI) 671 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 672 .legalFor(AllS32Vectors) 673 .legalFor(AllS64Vectors) 674 .legalFor(AddrSpaces64) 675 .legalFor(AddrSpaces32) 676 .legalFor(AddrSpaces128) 677 .legalIf(isPointer(0)) 678 .clampScalar(0, S16, S256) 679 .widenScalarToNextPow2(0, 32) 680 .clampMaxNumElements(0, S32, 16) 681 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 682 .scalarize(0); 683 684 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 685 // Full set of gfx9 features. 
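    // 16-bit and packed v2s16 arithmetic stays legal here; larger 16-bit
    // vectors are split back to v2s16, other vectors are scalarized, and
    // 64-bit add/sub/mul is only kept legal when the corresponding scalar
    // 64-bit instructions exist (checked below).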
686 if (ST.hasScalarAddSub64()) { 687 getActionDefinitionsBuilder({G_ADD, G_SUB}) 688 .legalFor({S64, S32, S16, V2S16}) 689 .clampMaxNumElementsStrict(0, S16, 2) 690 .scalarize(0) 691 .minScalar(0, S16) 692 .widenScalarToNextMultipleOf(0, 32) 693 .maxScalar(0, S32); 694 } else { 695 getActionDefinitionsBuilder({G_ADD, G_SUB}) 696 .legalFor({S32, S16, V2S16}) 697 .clampMaxNumElementsStrict(0, S16, 2) 698 .scalarize(0) 699 .minScalar(0, S16) 700 .widenScalarToNextMultipleOf(0, 32) 701 .maxScalar(0, S32); 702 } 703 704 if (ST.hasScalarSMulU64()) { 705 getActionDefinitionsBuilder(G_MUL) 706 .legalFor({S64, S32, S16, V2S16}) 707 .clampMaxNumElementsStrict(0, S16, 2) 708 .scalarize(0) 709 .minScalar(0, S16) 710 .widenScalarToNextMultipleOf(0, 32) 711 .custom(); 712 } else { 713 getActionDefinitionsBuilder(G_MUL) 714 .legalFor({S32, S16, V2S16}) 715 .clampMaxNumElementsStrict(0, S16, 2) 716 .scalarize(0) 717 .minScalar(0, S16) 718 .widenScalarToNextMultipleOf(0, 32) 719 .custom(); 720 } 721 assert(ST.hasMad64_32()); 722 723 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 724 .legalFor({S32, S16, V2S16}) // Clamp modifier 725 .minScalarOrElt(0, S16) 726 .clampMaxNumElementsStrict(0, S16, 2) 727 .scalarize(0) 728 .widenScalarToNextPow2(0, 32) 729 .lower(); 730 } else if (ST.has16BitInsts()) { 731 getActionDefinitionsBuilder({G_ADD, G_SUB}) 732 .legalFor({S32, S16}) 733 .minScalar(0, S16) 734 .widenScalarToNextMultipleOf(0, 32) 735 .maxScalar(0, S32) 736 .scalarize(0); 737 738 getActionDefinitionsBuilder(G_MUL) 739 .legalFor({S32, S16}) 740 .scalarize(0) 741 .minScalar(0, S16) 742 .widenScalarToNextMultipleOf(0, 32) 743 .custom(); 744 assert(ST.hasMad64_32()); 745 746 // Technically the saturating operations require clamp bit support, but this 747 // was introduced at the same time as 16-bit operations. 748 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 749 .legalFor({S32, S16}) // Clamp modifier 750 .minScalar(0, S16) 751 .scalarize(0) 752 .widenScalarToNextPow2(0, 16) 753 .lower(); 754 755 // We're just lowering this, but it helps get a better result to try to 756 // coerce to the desired type first. 757 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 758 .minScalar(0, S16) 759 .scalarize(0) 760 .lower(); 761 } else { 762 getActionDefinitionsBuilder({G_ADD, G_SUB}) 763 .legalFor({S32}) 764 .widenScalarToNextMultipleOf(0, 32) 765 .clampScalar(0, S32, S32) 766 .scalarize(0); 767 768 auto &Mul = getActionDefinitionsBuilder(G_MUL) 769 .legalFor({S32}) 770 .scalarize(0) 771 .minScalar(0, S32) 772 .widenScalarToNextMultipleOf(0, 32); 773 774 if (ST.hasMad64_32()) 775 Mul.custom(); 776 else 777 Mul.maxScalar(0, S32); 778 779 if (ST.hasIntClamp()) { 780 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 781 .legalFor({S32}) // Clamp modifier. 782 .scalarize(0) 783 .minScalarOrElt(0, S32) 784 .lower(); 785 } else { 786 // Clamp bit support was added in VI, along with 16-bit operations. 787 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 788 .minScalar(0, S32) 789 .scalarize(0) 790 .lower(); 791 } 792 793 // FIXME: DAG expansion gets better results. The widening uses the smaller 794 // range values and goes for the min/max lowering directly. 
795 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 796 .minScalar(0, S32) 797 .scalarize(0) 798 .lower(); 799 } 800 801 getActionDefinitionsBuilder( 802 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 803 .customFor({S32, S64}) 804 .clampScalar(0, S32, S64) 805 .widenScalarToNextPow2(0, 32) 806 .scalarize(0); 807 808 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 809 .legalFor({S32}) 810 .maxScalar(0, S32); 811 812 if (ST.hasVOP3PInsts()) { 813 Mulh 814 .clampMaxNumElements(0, S8, 2) 815 .lowerFor({V2S8}); 816 } 817 818 Mulh 819 .scalarize(0) 820 .lower(); 821 822 // Report legal for any types we can handle anywhere. For the cases only legal 823 // on the SALU, RegBankSelect will be able to re-legalize. 824 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 825 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 826 .clampScalar(0, S32, S64) 827 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 828 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 829 .widenScalarToNextPow2(0) 830 .scalarize(0); 831 832 getActionDefinitionsBuilder( 833 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 834 .legalFor({{S32, S1}, {S32, S32}}) 835 .clampScalar(0, S32, S32) 836 .scalarize(0); 837 838 getActionDefinitionsBuilder(G_BITCAST) 839 // Don't worry about the size constraint. 840 .legalIf(all(isRegisterType(0), isRegisterType(1))) 841 .lower(); 842 843 844 getActionDefinitionsBuilder(G_CONSTANT) 845 .legalFor({S1, S32, S64, S16, GlobalPtr, 846 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 847 .legalIf(isPointer(0)) 848 .clampScalar(0, S32, S64) 849 .widenScalarToNextPow2(0); 850 851 getActionDefinitionsBuilder(G_FCONSTANT) 852 .legalFor({S32, S64, S16}) 853 .clampScalar(0, S16, S64); 854 855 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 856 .legalIf(isRegisterType(0)) 857 // s1 and s16 are special cases because they have legal operations on 858 // them, but don't really occupy registers in the normal way. 859 .legalFor({S1, S16}) 860 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 861 .clampScalarOrElt(0, S32, MaxScalar) 862 .widenScalarToNextPow2(0, 32) 863 .clampMaxNumElements(0, S32, 16); 864 865 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 866 867 // If the amount is divergent, we have to do a wave reduction to get the 868 // maximum value, so this is expanded during RegBankSelect. 
869 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 870 .legalFor({{PrivatePtr, S32}}); 871 872 getActionDefinitionsBuilder(G_STACKSAVE) 873 .customFor({PrivatePtr}); 874 getActionDefinitionsBuilder(G_STACKRESTORE) 875 .legalFor({PrivatePtr}); 876 877 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 878 .customIf(typeIsNot(0, PrivatePtr)); 879 880 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 881 882 auto &FPOpActions = getActionDefinitionsBuilder( 883 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE, 884 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA}) 885 .legalFor({S32, S64}); 886 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 887 .customFor({S32, S64}); 888 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 889 .customFor({S32, S64}); 890 891 if (ST.has16BitInsts()) { 892 if (ST.hasVOP3PInsts()) 893 FPOpActions.legalFor({S16, V2S16}); 894 else 895 FPOpActions.legalFor({S16}); 896 897 TrigActions.customFor({S16}); 898 FDIVActions.customFor({S16}); 899 } 900 901 if (ST.hasPackedFP32Ops()) { 902 FPOpActions.legalFor({V2S32}); 903 FPOpActions.clampMaxNumElementsStrict(0, S32, 2); 904 } 905 906 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 907 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 908 909 if (ST.hasVOP3PInsts()) { 910 MinNumMaxNum.customFor(FPTypesPK16) 911 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 912 .clampMaxNumElements(0, S16, 2) 913 .clampScalar(0, S16, S64) 914 .scalarize(0); 915 } else if (ST.has16BitInsts()) { 916 MinNumMaxNum.customFor(FPTypes16) 917 .clampScalar(0, S16, S64) 918 .scalarize(0); 919 } else { 920 MinNumMaxNum.customFor(FPTypesBase) 921 .clampScalar(0, S32, S64) 922 .scalarize(0); 923 } 924 925 if (ST.hasVOP3PInsts()) 926 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 927 928 FPOpActions 929 .scalarize(0) 930 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 931 932 TrigActions 933 .scalarize(0) 934 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 935 936 FDIVActions 937 .scalarize(0) 938 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 939 940 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 941 .legalFor(FPTypesPK16) 942 .clampMaxNumElementsStrict(0, S16, 2) 943 .scalarize(0) 944 .clampScalar(0, S16, S64); 945 946 if (ST.has16BitInsts()) { 947 getActionDefinitionsBuilder(G_FSQRT) 948 .legalFor({S16}) 949 .customFor({S32, S64}) 950 .scalarize(0) 951 .unsupported(); 952 getActionDefinitionsBuilder(G_FFLOOR) 953 .legalFor({S32, S64, S16}) 954 .scalarize(0) 955 .clampScalar(0, S16, S64); 956 957 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 958 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}}) 959 .scalarize(0) 960 .maxScalarIf(typeIs(0, S16), 1, S16) 961 .clampScalar(1, S32, S32) 962 .lower(); 963 964 getActionDefinitionsBuilder(G_FFREXP) 965 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}}) 966 .scalarize(0) 967 .lower(); 968 } else { 969 getActionDefinitionsBuilder(G_FSQRT) 970 .customFor({S32, S64, S16}) 971 .scalarize(0) 972 .unsupported(); 973 974 975 if (ST.hasFractBug()) { 976 getActionDefinitionsBuilder(G_FFLOOR) 977 .customFor({S64}) 978 .legalFor({S32, S64}) 979 .scalarize(0) 980 .clampScalar(0, S32, S64); 981 } else { 982 getActionDefinitionsBuilder(G_FFLOOR) 983 .legalFor({S32, S64}) 984 .scalarize(0) 985 .clampScalar(0, S32, S64); 986 } 987 988 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 989 .legalFor({{S32, S32}, {S64, S32}}) 990 .scalarize(0) 991 .clampScalar(0, S32, S64) 992 .clampScalar(1, S32, S32) 993 .lower(); 994 995 getActionDefinitionsBuilder(G_FFREXP) 996 .customFor({{S32, S32}, {S64, S32}}) 997 .scalarize(0) 998 .minScalar(0, S32) 999 .clampScalar(1, S32, S32) 1000 .lower(); 1001 } 1002 1003 getActionDefinitionsBuilder(G_FPTRUNC) 1004 .legalFor({{S32, S64}, {S16, S32}}) 1005 .scalarize(0) 1006 .lower(); 1007 1008 getActionDefinitionsBuilder(G_FPEXT) 1009 .legalFor({{S64, S32}, {S32, S16}}) 1010 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 1011 .scalarize(0); 1012 1013 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB}); 1014 if (ST.has16BitInsts()) { 1015 FSubActions 1016 // Use actual fsub instruction 1017 .legalFor({S32, S16}) 1018 // Must use fadd + fneg 1019 .lowerFor({S64, V2S16}); 1020 } else { 1021 FSubActions 1022 // Use actual fsub instruction 1023 .legalFor({S32}) 1024 // Must use fadd + fneg 1025 .lowerFor({S64, S16, V2S16}); 1026 } 1027 1028 FSubActions 1029 .scalarize(0) 1030 .clampScalar(0, S32, S64); 1031 1032 // Whether this is legal depends on the floating point mode for the function. 1033 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 1034 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 1035 FMad.customFor({S32, S16}); 1036 else if (ST.hasMadMacF32Insts()) 1037 FMad.customFor({S32}); 1038 else if (ST.hasMadF16()) 1039 FMad.customFor({S16}); 1040 FMad.scalarize(0) 1041 .lower(); 1042 1043 auto &FRem = getActionDefinitionsBuilder(G_FREM); 1044 if (ST.has16BitInsts()) { 1045 FRem.customFor({S16, S32, S64}); 1046 } else { 1047 FRem.minScalar(0, S32) 1048 .customFor({S32, S64}); 1049 } 1050 FRem.scalarize(0); 1051 1052 // TODO: Do we need to clamp maximum bitwidth? 1053 getActionDefinitionsBuilder(G_TRUNC) 1054 .legalIf(isScalar(0)) 1055 .legalFor({{V2S16, V2S32}}) 1056 .clampMaxNumElements(0, S16, 2) 1057 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 1058 // situations (like an invalid implicit use), we don't want to infinite loop 1059 // in the legalizer. 
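    // elementTypeIsLegal only matches vectors of s16 or 32-bit-and-wider
    // elements; anything else falls through to alwaysLegal() rather than
    // risking a loop.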
1060 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 1061 .alwaysLegal(); 1062 1063 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 1064 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 1065 {S32, S1}, {S64, S1}, {S16, S1}}) 1066 .scalarize(0) 1067 .clampScalar(0, S32, S64) 1068 .widenScalarToNextPow2(1, 32); 1069 1070 // TODO: Split s1->s64 during regbankselect for VALU. 1071 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 1072 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 1073 .lowerIf(typeIs(1, S1)) 1074 .customFor({{S32, S64}, {S64, S64}}); 1075 if (ST.has16BitInsts()) 1076 IToFP.legalFor({{S16, S16}}); 1077 IToFP.clampScalar(1, S32, S64) 1078 .minScalar(0, S32) 1079 .scalarize(0) 1080 .widenScalarToNextPow2(1); 1081 1082 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 1083 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 1084 .customFor({{S64, S32}, {S64, S64}}) 1085 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 1086 if (ST.has16BitInsts()) 1087 FPToI.legalFor({{S16, S16}}); 1088 else 1089 FPToI.minScalar(1, S32); 1090 1091 FPToI.minScalar(0, S32) 1092 .widenScalarToNextPow2(0, 32) 1093 .scalarize(0) 1094 .lower(); 1095 1096 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 1097 .customFor({S16, S32}) 1098 .scalarize(0) 1099 .lower(); 1100 1101 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN 1102 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT}) 1103 .scalarize(0) 1104 .lower(); 1105 1106 if (ST.has16BitInsts()) { 1107 getActionDefinitionsBuilder( 1108 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1109 .legalFor({S16, S32, S64}) 1110 .clampScalar(0, S16, S64) 1111 .scalarize(0); 1112 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 1113 getActionDefinitionsBuilder( 1114 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1115 .legalFor({S32, S64}) 1116 .clampScalar(0, S32, S64) 1117 .scalarize(0); 1118 } else { 1119 getActionDefinitionsBuilder( 1120 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1121 .legalFor({S32}) 1122 .customFor({S64}) 1123 .clampScalar(0, S32, S64) 1124 .scalarize(0); 1125 } 1126 1127 getActionDefinitionsBuilder(G_PTR_ADD) 1128 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr}) 1129 .legalIf(all(isPointer(0), sameSize(0, 1))) 1130 .scalarize(0) 1131 .scalarSameSizeAs(1, 0); 1132 1133 getActionDefinitionsBuilder(G_PTRMASK) 1134 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 1135 .scalarSameSizeAs(1, 0) 1136 .scalarize(0); 1137 1138 auto &CmpBuilder = 1139 getActionDefinitionsBuilder(G_ICMP) 1140 // The compare output type differs based on the register bank of the output, 1141 // so make both s1 and s32 legal. 1142 // 1143 // Scalar compares producing output in scc will be promoted to s32, as that 1144 // is the allocatable register type that will be needed for the copy from 1145 // scc. This will be promoted during RegBankSelect, and we assume something 1146 // before that won't try to use s32 result types. 1147 // 1148 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 1149 // bank. 
1150 .legalForCartesianProduct( 1151 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 1152 .legalForCartesianProduct( 1153 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 1154 if (ST.has16BitInsts()) { 1155 CmpBuilder.legalFor({{S1, S16}}); 1156 } 1157 1158 CmpBuilder 1159 .widenScalarToNextPow2(1) 1160 .clampScalar(1, S32, S64) 1161 .scalarize(0) 1162 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 1163 1164 auto &FCmpBuilder = 1165 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct( 1166 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase); 1167 1168 if (ST.hasSALUFloatInsts()) 1169 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32}); 1170 1171 FCmpBuilder 1172 .widenScalarToNextPow2(1) 1173 .clampScalar(1, S32, S64) 1174 .scalarize(0); 1175 1176 // FIXME: fpow has a selection pattern that should move to custom lowering. 1177 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW); 1178 if (ST.has16BitInsts()) 1179 ExpOps.customFor({{S32}, {S16}}); 1180 else 1181 ExpOps.customFor({S32}); 1182 ExpOps.clampScalar(0, MinScalarFPTy, S32) 1183 .scalarize(0); 1184 1185 getActionDefinitionsBuilder(G_FPOWI) 1186 .clampScalar(0, MinScalarFPTy, S32) 1187 .lower(); 1188 1189 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2}); 1190 Log2Ops.customFor({S32}); 1191 if (ST.has16BitInsts()) 1192 Log2Ops.legalFor({S16}); 1193 else 1194 Log2Ops.customFor({S16}); 1195 Log2Ops.scalarize(0) 1196 .lower(); 1197 1198 auto &LogOps = 1199 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10}); 1200 LogOps.customFor({S32, S16}); 1201 LogOps.clampScalar(0, MinScalarFPTy, S32) 1202 .scalarize(0); 1203 1204 // The 64-bit versions produce 32-bit results, but only on the SALU. 1205 getActionDefinitionsBuilder(G_CTPOP) 1206 .legalFor({{S32, S32}, {S32, S64}}) 1207 .clampScalar(0, S32, S32) 1208 .widenScalarToNextPow2(1, 32) 1209 .clampScalar(1, S32, S64) 1210 .scalarize(0) 1211 .widenScalarToNextPow2(0, 32); 1212 1213 // If no 16 bit instr is available, lower into different instructions. 1214 if (ST.has16BitInsts()) 1215 getActionDefinitionsBuilder(G_IS_FPCLASS) 1216 .legalForCartesianProduct({S1}, FPTypes16) 1217 .widenScalarToNextPow2(1) 1218 .scalarize(0) 1219 .lower(); 1220 else 1221 getActionDefinitionsBuilder(G_IS_FPCLASS) 1222 .legalForCartesianProduct({S1}, FPTypesBase) 1223 .lowerFor({S1, S16}) 1224 .widenScalarToNextPow2(1) 1225 .scalarize(0) 1226 .lower(); 1227 1228 // The hardware instructions return a different result on 0 than the generic 1229 // instructions expect. The hardware produces -1, but these produce the 1230 // bitwidth. 1231 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 1232 .scalarize(0) 1233 .clampScalar(0, S32, S32) 1234 .clampScalar(1, S32, S64) 1235 .widenScalarToNextPow2(0, 32) 1236 .widenScalarToNextPow2(1, 32) 1237 .custom(); 1238 1239 // The 64-bit versions produce 32-bit results, but only on the SALU. 1240 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 1241 .legalFor({{S32, S32}, {S32, S64}}) 1242 .clampScalar(0, S32, S32) 1243 .clampScalar(1, S32, S64) 1244 .scalarize(0) 1245 .widenScalarToNextPow2(0, 32) 1246 .widenScalarToNextPow2(1, 32); 1247 1248 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1249 // RegBankSelect. 
1250 getActionDefinitionsBuilder(G_BITREVERSE) 1251 .legalFor({S32, S64}) 1252 .clampScalar(0, S32, S64) 1253 .scalarize(0) 1254 .widenScalarToNextPow2(0); 1255 1256 if (ST.has16BitInsts()) { 1257 getActionDefinitionsBuilder(G_BSWAP) 1258 .legalFor({S16, S32, V2S16}) 1259 .clampMaxNumElementsStrict(0, S16, 2) 1260 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1261 // narrowScalar limitation. 1262 .widenScalarToNextPow2(0) 1263 .clampScalar(0, S16, S32) 1264 .scalarize(0); 1265 1266 if (ST.hasVOP3PInsts()) { 1267 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1268 .legalFor({S32, S16, V2S16}) 1269 .clampMaxNumElements(0, S16, 2) 1270 .minScalar(0, S16) 1271 .widenScalarToNextPow2(0) 1272 .scalarize(0) 1273 .lower(); 1274 } else { 1275 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1276 .legalFor({S32, S16}) 1277 .widenScalarToNextPow2(0) 1278 .minScalar(0, S16) 1279 .scalarize(0) 1280 .lower(); 1281 } 1282 } else { 1283 // TODO: Should have same legality without v_perm_b32 1284 getActionDefinitionsBuilder(G_BSWAP) 1285 .legalFor({S32}) 1286 .lowerIf(scalarNarrowerThan(0, 32)) 1287 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1288 // narrowScalar limitation. 1289 .widenScalarToNextPow2(0) 1290 .maxScalar(0, S32) 1291 .scalarize(0) 1292 .lower(); 1293 1294 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1295 .legalFor({S32}) 1296 .minScalar(0, S32) 1297 .widenScalarToNextPow2(0) 1298 .scalarize(0) 1299 .lower(); 1300 } 1301 1302 getActionDefinitionsBuilder(G_INTTOPTR) 1303 // List the common cases 1304 .legalForCartesianProduct(AddrSpaces64, {S64}) 1305 .legalForCartesianProduct(AddrSpaces32, {S32}) 1306 .scalarize(0) 1307 // Accept any address space as long as the size matches 1308 .legalIf(sameSize(0, 1)) 1309 .widenScalarIf(smallerThan(1, 0), 1310 [](const LegalityQuery &Query) { 1311 return std::pair( 1312 1, LLT::scalar(Query.Types[0].getSizeInBits())); 1313 }) 1314 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { 1315 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1316 }); 1317 1318 getActionDefinitionsBuilder(G_PTRTOINT) 1319 // List the common cases 1320 .legalForCartesianProduct(AddrSpaces64, {S64}) 1321 .legalForCartesianProduct(AddrSpaces32, {S32}) 1322 .scalarize(0) 1323 // Accept any address space as long as the size matches 1324 .legalIf(sameSize(0, 1)) 1325 .widenScalarIf(smallerThan(0, 1), 1326 [](const LegalityQuery &Query) { 1327 return std::pair( 1328 0, LLT::scalar(Query.Types[1].getSizeInBits())); 1329 }) 1330 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { 1331 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1332 }); 1333 1334 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1335 .scalarize(0) 1336 .custom(); 1337 1338 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1339 bool IsLoad) -> bool { 1340 const LLT DstTy = Query.Types[0]; 1341 1342 // Split vector extloads. 1343 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1344 1345 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1346 return true; 1347 1348 const LLT PtrTy = Query.Types[1]; 1349 unsigned AS = PtrTy.getAddressSpace(); 1350 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 1351 Query.MMODescrs[0].Ordering != 1352 AtomicOrdering::NotAtomic)) 1353 return true; 1354 1355 // Catch weird sized loads that don't evenly divide into the access sizes 1356 // TODO: May be able to widen depending on alignment etc. 
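    // e.g. a 96-bit access needs 3 dwords and is only kept whole with dwordx3
    // support; other non-power-of-2 dword counts are split.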
1357 unsigned NumRegs = (MemSize + 31) / 32; 1358 if (NumRegs == 3) { 1359 if (!ST.hasDwordx3LoadStores()) 1360 return true; 1361 } else { 1362 // If the alignment allows, these should have been widened. 1363 if (!isPowerOf2_32(NumRegs)) 1364 return true; 1365 } 1366 1367 return false; 1368 }; 1369 1370 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1371 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1372 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1373 1374 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1375 // LDS 1376 // TODO: Unsupported flat for SI. 1377 1378 for (unsigned Op : {G_LOAD, G_STORE}) { 1379 const bool IsStore = Op == G_STORE; 1380 1381 auto &Actions = getActionDefinitionsBuilder(Op); 1382 // Explicitly list some common cases. 1383 // TODO: Does this help compile time at all? 1384 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1385 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1386 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1387 {S64, GlobalPtr, S64, GlobalAlign32}, 1388 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1389 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1390 {S32, GlobalPtr, S8, GlobalAlign8}, 1391 {S32, GlobalPtr, S16, GlobalAlign16}, 1392 1393 {S32, LocalPtr, S32, 32}, 1394 {S64, LocalPtr, S64, 32}, 1395 {V2S32, LocalPtr, V2S32, 32}, 1396 {S32, LocalPtr, S8, 8}, 1397 {S32, LocalPtr, S16, 16}, 1398 {V2S16, LocalPtr, S32, 32}, 1399 1400 {S32, PrivatePtr, S32, 32}, 1401 {S32, PrivatePtr, S8, 8}, 1402 {S32, PrivatePtr, S16, 16}, 1403 {V2S16, PrivatePtr, S32, 32}, 1404 1405 {S32, ConstantPtr, S32, GlobalAlign32}, 1406 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1407 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1408 {S64, ConstantPtr, S64, GlobalAlign32}, 1409 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1410 Actions.legalIf( 1411 [=](const LegalityQuery &Query) -> bool { 1412 return isLoadStoreLegal(ST, Query); 1413 }); 1414 1415 // The custom pointers (fat pointers, buffer resources) don't work with load 1416 // and store at this level. Fat pointers should have been lowered to 1417 // intrinsics before the translation to MIR. 1418 Actions.unsupportedIf( 1419 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr})); 1420 1421 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and 1422 // ptrtoint. This is needed to account for the fact that we can't have i128 1423 // as a register class for SelectionDAG reasons. 1424 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1425 return hasBufferRsrcWorkaround(Query.Types[0]); 1426 }); 1427 1428 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1429 // 64-bits. 1430 // 1431 // TODO: Should generalize bitcast action into coerce, which will also cover 1432 // inserting addrspacecasts. 1433 Actions.customIf(typeIs(1, Constant32Ptr)); 1434 1435 // Turn any illegal element vectors into something easier to deal 1436 // with. These will ultimately produce 32-bit scalar shifts to extract the 1437 // parts anyway. 1438 // 1439 // For odd 16-bit element vectors, prefer to split those into pieces with 1440 // 16-bit vector parts. 1441 Actions.bitcastIf( 1442 [=](const LegalityQuery &Query) -> bool { 1443 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1444 Query.MMODescrs[0].MemoryTy); 1445 }, bitcastToRegisterType(0)); 1446 1447 if (!IsStore) { 1448 // Widen suitably aligned loads by loading extra bytes. 
The standard 1449 // legalization actions can't properly express widening memory operands. 1450 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1451 return shouldWidenLoad(ST, Query, G_LOAD); 1452 }); 1453 } 1454 1455 // FIXME: load/store narrowing should be moved to lower action 1456 Actions 1457 .narrowScalarIf( 1458 [=](const LegalityQuery &Query) -> bool { 1459 return !Query.Types[0].isVector() && 1460 needToSplitMemOp(Query, Op == G_LOAD); 1461 }, 1462 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1463 const LLT DstTy = Query.Types[0]; 1464 const LLT PtrTy = Query.Types[1]; 1465 1466 const unsigned DstSize = DstTy.getSizeInBits(); 1467 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1468 1469 // Split extloads. 1470 if (DstSize > MemSize) 1471 return std::pair(0, LLT::scalar(MemSize)); 1472 1473 unsigned MaxSize = maxSizeForAddrSpace( 1474 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1475 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1476 if (MemSize > MaxSize) 1477 return std::pair(0, LLT::scalar(MaxSize)); 1478 1479 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1480 return std::pair(0, LLT::scalar(Align)); 1481 }) 1482 .fewerElementsIf( 1483 [=](const LegalityQuery &Query) -> bool { 1484 return Query.Types[0].isVector() && 1485 needToSplitMemOp(Query, Op == G_LOAD); 1486 }, 1487 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1488 const LLT DstTy = Query.Types[0]; 1489 const LLT PtrTy = Query.Types[1]; 1490 1491 LLT EltTy = DstTy.getElementType(); 1492 unsigned MaxSize = maxSizeForAddrSpace( 1493 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1494 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1495 1496 // FIXME: Handle widened to power of 2 results better. This ends 1497 // up scalarizing. 1498 // FIXME: 3 element stores scalarized on SI 1499 1500 // Split if it's too large for the address space. 1501 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1502 if (MemSize > MaxSize) { 1503 unsigned NumElts = DstTy.getNumElements(); 1504 unsigned EltSize = EltTy.getSizeInBits(); 1505 1506 if (MaxSize % EltSize == 0) { 1507 return std::pair( 1508 0, LLT::scalarOrVector( 1509 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1510 } 1511 1512 unsigned NumPieces = MemSize / MaxSize; 1513 1514 // FIXME: Refine when odd breakdowns handled 1515 // The scalars will need to be re-legalized. 1516 if (NumPieces == 1 || NumPieces >= NumElts || 1517 NumElts % NumPieces != 0) 1518 return std::pair(0, EltTy); 1519 1520 return std::pair(0, 1521 LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1522 } 1523 1524 // FIXME: We could probably handle weird extending loads better. 1525 if (DstTy.getSizeInBits() > MemSize) 1526 return std::pair(0, EltTy); 1527 1528 unsigned EltSize = EltTy.getSizeInBits(); 1529 unsigned DstSize = DstTy.getSizeInBits(); 1530 if (!isPowerOf2_32(DstSize)) { 1531 // We're probably decomposing an odd sized store. Try to split 1532 // to the widest type. TODO: Account for alignment. As-is it 1533 // should be OK, since the new parts will be further legalized. 1534 unsigned FloorSize = llvm::bit_floor(DstSize); 1535 return std::pair( 1536 0, LLT::scalarOrVector( 1537 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1538 } 1539 1540 // May need relegalization for the scalars. 
1541 return std::pair(0, EltTy); 1542 }) 1543 .minScalar(0, S32) 1544 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1545 .widenScalarToNextPow2(0) 1546 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1547 .lower(); 1548 } 1549 1550 // FIXME: Unaligned accesses not lowered. 1551 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1552 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1553 {S32, GlobalPtr, S16, 2 * 8}, 1554 {S32, LocalPtr, S8, 8}, 1555 {S32, LocalPtr, S16, 16}, 1556 {S32, PrivatePtr, S8, 8}, 1557 {S32, PrivatePtr, S16, 16}, 1558 {S32, ConstantPtr, S8, 8}, 1559 {S32, ConstantPtr, S16, 2 * 8}}) 1560 .legalIf( 1561 [=](const LegalityQuery &Query) -> bool { 1562 return isLoadStoreLegal(ST, Query); 1563 }); 1564 1565 if (ST.hasFlatAddressSpace()) { 1566 ExtLoads.legalForTypesWithMemDesc( 1567 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1568 } 1569 1570 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1571 // 64-bits. 1572 // 1573 // TODO: Should generalize bitcast action into coerce, which will also cover 1574 // inserting addrspacecasts. 1575 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1576 1577 ExtLoads.clampScalar(0, S32, S32) 1578 .widenScalarToNextPow2(0) 1579 .lower(); 1580 1581 auto &Atomics = getActionDefinitionsBuilder( 1582 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1583 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1584 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1585 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP}) 1586 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1587 {S64, GlobalPtr}, {S64, LocalPtr}, 1588 {S32, RegionPtr}, {S64, RegionPtr}}); 1589 if (ST.hasFlatAddressSpace()) { 1590 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1591 } 1592 1593 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1594 if (ST.hasLDSFPAtomicAdd()) { 1595 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1596 if (ST.hasGFX90AInsts()) 1597 Atomic.legalFor({{S64, LocalPtr}}); 1598 if (ST.hasAtomicDsPkAdd16Insts()) 1599 Atomic.legalFor({{V2S16, LocalPtr}}); 1600 } 1601 if (ST.hasAtomicFaddInsts()) 1602 Atomic.legalFor({{S32, GlobalPtr}}); 1603 if (ST.hasFlatAtomicFaddF32Inst()) 1604 Atomic.legalFor({{S32, FlatPtr}}); 1605 1606 if (ST.hasGFX90AInsts()) { 1607 // These are legal with some caveats, and should have undergone expansion in 1608 // the IR in most situations 1609 // TODO: Move atomic expansion into legalizer 1610 Atomic.legalFor({ 1611 {S32, GlobalPtr}, 1612 {S64, GlobalPtr}, 1613 {S64, FlatPtr} 1614 }); 1615 } 1616 1617 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1618 // demarshalling 1619 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1620 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1621 {S32, FlatPtr}, {S64, FlatPtr}}) 1622 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1623 {S32, RegionPtr}, {S64, RegionPtr}}); 1624 // TODO: Pointer types, any 32-bit or 64-bit vector 1625 1626 // Condition should be s32 for scalar, s1 for vector. 
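  // Both condition types are listed below, mirroring G_ICMP, so the condition
  // can end up in either SCC (s32) or VCC (s1) after RegBankSelect.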
1627 getActionDefinitionsBuilder(G_SELECT) 1628 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1629 LocalPtr, FlatPtr, PrivatePtr, 1630 LLT::fixed_vector(2, LocalPtr), 1631 LLT::fixed_vector(2, PrivatePtr)}, 1632 {S1, S32}) 1633 .clampScalar(0, S16, S64) 1634 .scalarize(1) 1635 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1636 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1637 .clampMaxNumElements(0, S32, 2) 1638 .clampMaxNumElements(0, LocalPtr, 2) 1639 .clampMaxNumElements(0, PrivatePtr, 2) 1640 .scalarize(0) 1641 .widenScalarToNextPow2(0) 1642 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1643 1644 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1645 // be more flexible with the shift amount type. 1646 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1647 .legalFor({{S32, S32}, {S64, S32}}); 1648 if (ST.has16BitInsts()) { 1649 if (ST.hasVOP3PInsts()) { 1650 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1651 .clampMaxNumElements(0, S16, 2); 1652 } else 1653 Shifts.legalFor({{S16, S16}}); 1654 1655 // TODO: Support 16-bit shift amounts for all types 1656 Shifts.widenScalarIf( 1657 [=](const LegalityQuery &Query) { 1658 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1659 // 32-bit amount. 1660 const LLT ValTy = Query.Types[0]; 1661 const LLT AmountTy = Query.Types[1]; 1662 return ValTy.getSizeInBits() <= 16 && 1663 AmountTy.getSizeInBits() < 16; 1664 }, changeTo(1, S16)); 1665 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1666 Shifts.clampScalar(1, S32, S32); 1667 Shifts.widenScalarToNextPow2(0, 16); 1668 Shifts.clampScalar(0, S16, S64); 1669 1670 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1671 .minScalar(0, S16) 1672 .scalarize(0) 1673 .lower(); 1674 } else { 1675 // Make sure we legalize the shift amount type first, as the general 1676 // expansion for the shifted type will produce much worse code if it hasn't 1677 // been truncated already. 1678 Shifts.clampScalar(1, S32, S32); 1679 Shifts.widenScalarToNextPow2(0, 32); 1680 Shifts.clampScalar(0, S32, S64); 1681 1682 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1683 .minScalar(0, S32) 1684 .scalarize(0) 1685 .lower(); 1686 } 1687 Shifts.scalarize(0); 1688 1689 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1690 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1691 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1692 unsigned IdxTypeIdx = 2; 1693 1694 getActionDefinitionsBuilder(Op) 1695 .customIf([=](const LegalityQuery &Query) { 1696 const LLT EltTy = Query.Types[EltTypeIdx]; 1697 const LLT VecTy = Query.Types[VecTypeIdx]; 1698 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1699 const unsigned EltSize = EltTy.getSizeInBits(); 1700 const bool isLegalVecType = 1701 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); 1702 // Address space 8 pointers are 128-bit wide values, but the logic 1703 // below will try to bitcast them to 2N x s64, which will fail. 1704 // Therefore, as an intermediate step, wrap extracts/insertions from a 1705 // ptrtoint-ing the vector and scalar arguments (or inttoptring the 1706 // extraction result) in order to produce a vector operation that can 1707 // be handled by the logic below. 
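        // Returning true routes such pointer element types to the custom
        // lowering, where that wrapping is done.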
1708 if (EltTy.isPointer() && EltSize > 64) 1709 return true; 1710 return (EltSize == 32 || EltSize == 64) && 1711 VecTy.getSizeInBits() % 32 == 0 && 1712 VecTy.getSizeInBits() <= MaxRegisterSize && 1713 IdxTy.getSizeInBits() == 32 && 1714 isLegalVecType; 1715 }) 1716 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1717 bitcastToVectorElement32(VecTypeIdx)) 1718 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1719 .bitcastIf( 1720 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1721 [=](const LegalityQuery &Query) { 1722 // For > 64-bit element types, try to turn this into a 64-bit 1723 // element vector since we may be able to do better indexing 1724 // if this is scalar. If not, fall back to 32. 1725 const LLT EltTy = Query.Types[EltTypeIdx]; 1726 const LLT VecTy = Query.Types[VecTypeIdx]; 1727 const unsigned DstEltSize = EltTy.getSizeInBits(); 1728 const unsigned VecSize = VecTy.getSizeInBits(); 1729 1730 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1731 return std::pair( 1732 VecTypeIdx, 1733 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1734 }) 1735 .clampScalar(EltTypeIdx, S32, S64) 1736 .clampScalar(VecTypeIdx, S32, S64) 1737 .clampScalar(IdxTypeIdx, S32, S32) 1738 .clampMaxNumElements(VecTypeIdx, S32, 32) 1739 // TODO: Clamp elements for 64-bit vectors? 1740 .moreElementsIf( 1741 isIllegalRegisterType(VecTypeIdx), 1742 moreElementsToNextExistingRegClass(VecTypeIdx)) 1743 // It should only be necessary with variable indexes. 1744 // As a last resort, lower to the stack 1745 .lower(); 1746 } 1747 1748 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1749 .unsupportedIf([=](const LegalityQuery &Query) { 1750 const LLT &EltTy = Query.Types[1].getElementType(); 1751 return Query.Types[0] != EltTy; 1752 }); 1753 1754 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1755 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1756 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1757 1758 // FIXME: Doesn't handle extract of illegal sizes. 1759 getActionDefinitionsBuilder(Op) 1760 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1761 .lowerIf([=](const LegalityQuery &Query) { 1762 // Sub-vector(or single element) insert and extract. 1763 // TODO: verify immediate offset here since lower only works with 1764 // whole elements. 1765 const LLT BigTy = Query.Types[BigTyIdx]; 1766 return BigTy.isVector(); 1767 }) 1768 // FIXME: Multiples of 16 should not be legal. 
1769 .legalIf([=](const LegalityQuery &Query) { 1770 const LLT BigTy = Query.Types[BigTyIdx]; 1771 const LLT LitTy = Query.Types[LitTyIdx]; 1772 return (BigTy.getSizeInBits() % 32 == 0) && 1773 (LitTy.getSizeInBits() % 16 == 0); 1774 }) 1775 .widenScalarIf( 1776 [=](const LegalityQuery &Query) { 1777 const LLT BigTy = Query.Types[BigTyIdx]; 1778 return (BigTy.getScalarSizeInBits() < 16); 1779 }, 1780 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1781 .widenScalarIf( 1782 [=](const LegalityQuery &Query) { 1783 const LLT LitTy = Query.Types[LitTyIdx]; 1784 return (LitTy.getScalarSizeInBits() < 16); 1785 }, 1786 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1787 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1788 .widenScalarToNextPow2(BigTyIdx, 32); 1789 1790 } 1791 1792 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1793 .legalForCartesianProduct(AllS32Vectors, {S32}) 1794 .legalForCartesianProduct(AllS64Vectors, {S64}) 1795 .clampNumElements(0, V16S32, V32S32) 1796 .clampNumElements(0, V2S64, V16S64) 1797 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) 1798 .moreElementsIf( 1799 isIllegalRegisterType(0), 1800 moreElementsToNextExistingRegClass(0)); 1801 1802 if (ST.hasScalarPackInsts()) { 1803 BuildVector 1804 // FIXME: Should probably widen s1 vectors straight to s32 1805 .minScalarOrElt(0, S16) 1806 .minScalar(1, S16); 1807 1808 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1809 .legalFor({V2S16, S32}) 1810 .lower(); 1811 } else { 1812 BuildVector.customFor({V2S16, S16}); 1813 BuildVector.minScalarOrElt(0, S32); 1814 1815 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1816 .customFor({V2S16, S32}) 1817 .lower(); 1818 } 1819 1820 BuildVector.legalIf(isRegisterType(0)); 1821 1822 // FIXME: Clamp maximum size 1823 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1824 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1825 .clampMaxNumElements(0, S32, 32) 1826 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1827 .clampMaxNumElements(0, S16, 64); 1828 1829 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1830 1831 // Merge/Unmerge 1832 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1833 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1834 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1835 1836 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1837 const LLT Ty = Query.Types[TypeIdx]; 1838 if (Ty.isVector()) { 1839 const LLT &EltTy = Ty.getElementType(); 1840 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1841 return true; 1842 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) 1843 return true; 1844 } 1845 return false; 1846 }; 1847 1848 auto &Builder = getActionDefinitionsBuilder(Op) 1849 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1850 .lowerFor({{S16, V2S16}}) 1851 .lowerIf([=](const LegalityQuery &Query) { 1852 const LLT BigTy = Query.Types[BigTyIdx]; 1853 return BigTy.getSizeInBits() == 32; 1854 }) 1855 // Try to widen to s16 first for small types. 1856 // TODO: Only do this on targets with legal s16 shifts 1857 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1858 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1859 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1860 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1861 elementTypeIs(1, S16)), 1862 changeTo(1, V2S16)) 1863 // Clamp the little scalar to s8-s256 and make it a power of 2. 
It's not
1864 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1865 // valid.
1866 .clampScalar(LitTyIdx, S32, S512)
1867 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1868 // Break up vectors with weird elements into scalars
1869 .fewerElementsIf(
1870 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1871 scalarize(0))
1872 .fewerElementsIf(
1873 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1874 scalarize(1))
1875 .clampScalar(BigTyIdx, S32, MaxScalar);
1876
1877 if (Op == G_MERGE_VALUES) {
1878 Builder.widenScalarIf(
1879 // TODO: Use 16-bit shifts if legal for 8-bit values?
1880 [=](const LegalityQuery &Query) {
1881 const LLT Ty = Query.Types[LitTyIdx];
1882 return Ty.getSizeInBits() < 32;
1883 },
1884 changeTo(LitTyIdx, S32));
1885 }
1886
1887 Builder.widenScalarIf(
1888 [=](const LegalityQuery &Query) {
1889 const LLT Ty = Query.Types[BigTyIdx];
1890 return Ty.getSizeInBits() % 16 != 0;
1891 },
1892 [=](const LegalityQuery &Query) {
1893 // Pick the next power of 2, or a multiple of 64 over 128,
1894 // whichever is smaller.
1895 const LLT &Ty = Query.Types[BigTyIdx];
1896 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1897 if (NewSizeInBits >= 256) {
1898 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1899 if (RoundedTo < NewSizeInBits)
1900 NewSizeInBits = RoundedTo;
1901 }
1902 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1903 })
1904 // Any vectors left are the wrong size. Scalarize them.
1905 .scalarize(0)
1906 .scalarize(1);
1907 }
1908
1909 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1910 // RegBankSelect.
1911 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1912 .legalFor({{S32}, {S64}});
1913
1914 if (ST.hasVOP3PInsts()) {
1915 SextInReg.lowerFor({{V2S16}})
1916 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1917 // get more vector shift opportunities, since we'll get those when
1918 // expanded.
1919 .clampMaxNumElementsStrict(0, S16, 2);
1920 } else if (ST.has16BitInsts()) {
1921 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1922 } else {
1923 // Prefer to promote to s32 before lowering if we don't have 16-bit
1924 // shifts. This avoids a lot of intermediate truncate and extend operations.
1925 SextInReg.lowerFor({{S32}, {S64}});
1926 }
1927
1928 SextInReg
1929 .scalarize(0)
1930 .clampScalar(0, S32, S64)
1931 .lower();
1932
1933 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1934 .scalarize(0)
1935 .lower();
1936
1937 // TODO: Only try to form v2s16 with legal packed instructions.
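// A 32-bit funnel shift right is a single native operation (v_alignbit_b32);
// other widths are scalarized and expanded by the generic lowering below.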
1938 getActionDefinitionsBuilder(G_FSHR) 1939 .legalFor({{S32, S32}}) 1940 .lowerFor({{V2S16, V2S16}}) 1941 .clampMaxNumElementsStrict(0, S16, 2) 1942 .scalarize(0) 1943 .lower(); 1944 1945 if (ST.hasVOP3PInsts()) { 1946 getActionDefinitionsBuilder(G_FSHL) 1947 .lowerFor({{V2S16, V2S16}}) 1948 .clampMaxNumElementsStrict(0, S16, 2) 1949 .scalarize(0) 1950 .lower(); 1951 } else { 1952 getActionDefinitionsBuilder(G_FSHL) 1953 .scalarize(0) 1954 .lower(); 1955 } 1956 1957 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1958 .legalFor({S64}); 1959 1960 getActionDefinitionsBuilder(G_FENCE) 1961 .alwaysLegal(); 1962 1963 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1964 .scalarize(0) 1965 .minScalar(0, S32) 1966 .lower(); 1967 1968 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1969 .legalFor({{S32, S32}, {S64, S32}}) 1970 .clampScalar(1, S32, S32) 1971 .clampScalar(0, S32, S64) 1972 .widenScalarToNextPow2(0) 1973 .scalarize(0); 1974 1975 getActionDefinitionsBuilder( 1976 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops 1977 G_FCOPYSIGN, 1978 1979 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB, 1980 G_READ_REGISTER, G_WRITE_REGISTER, 1981 1982 G_SADDO, G_SSUBO}) 1983 .lower(); 1984 1985 if (ST.hasIEEEMinMax()) { 1986 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) 1987 .legalFor(FPTypesPK16) 1988 .clampMaxNumElements(0, S16, 2) 1989 .scalarize(0); 1990 } else { 1991 // TODO: Implement 1992 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 1993 } 1994 1995 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1996 .lower(); 1997 1998 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1999 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 2000 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 2001 .unsupported(); 2002 2003 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal(); 2004 2005 getLegacyLegalizerInfo().computeTables(); 2006 verify(*ST.getInstrInfo()); 2007 } 2008 2009 bool AMDGPULegalizerInfo::legalizeCustom( 2010 LegalizerHelper &Helper, MachineInstr &MI, 2011 LostDebugLocObserver &LocObserver) const { 2012 MachineIRBuilder &B = Helper.MIRBuilder; 2013 MachineRegisterInfo &MRI = *B.getMRI(); 2014 2015 switch (MI.getOpcode()) { 2016 case TargetOpcode::G_ADDRSPACE_CAST: 2017 return legalizeAddrSpaceCast(MI, MRI, B); 2018 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: 2019 return legalizeFroundeven(MI, MRI, B); 2020 case TargetOpcode::G_FCEIL: 2021 return legalizeFceil(MI, MRI, B); 2022 case TargetOpcode::G_FREM: 2023 return legalizeFrem(MI, MRI, B); 2024 case TargetOpcode::G_INTRINSIC_TRUNC: 2025 return legalizeIntrinsicTrunc(MI, MRI, B); 2026 case TargetOpcode::G_SITOFP: 2027 return legalizeITOFP(MI, MRI, B, true); 2028 case TargetOpcode::G_UITOFP: 2029 return legalizeITOFP(MI, MRI, B, false); 2030 case TargetOpcode::G_FPTOSI: 2031 return legalizeFPTOI(MI, MRI, B, true); 2032 case TargetOpcode::G_FPTOUI: 2033 return legalizeFPTOI(MI, MRI, B, false); 2034 case TargetOpcode::G_FMINNUM: 2035 case TargetOpcode::G_FMAXNUM: 2036 case TargetOpcode::G_FMINNUM_IEEE: 2037 case TargetOpcode::G_FMAXNUM_IEEE: 2038 return legalizeMinNumMaxNum(Helper, MI); 2039 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2040 return legalizeExtractVectorElt(MI, MRI, B); 2041 case TargetOpcode::G_INSERT_VECTOR_ELT: 2042 return legalizeInsertVectorElt(MI, MRI, B); 2043 case TargetOpcode::G_FSIN: 2044 case TargetOpcode::G_FCOS: 2045 return legalizeSinCos(MI, MRI, B); 2046 case TargetOpcode::G_GLOBAL_VALUE: 2047 return legalizeGlobalValue(MI, MRI, B); 2048 case 
TargetOpcode::G_LOAD:
2049 case TargetOpcode::G_SEXTLOAD:
2050 case TargetOpcode::G_ZEXTLOAD:
2051 return legalizeLoad(Helper, MI);
2052 case TargetOpcode::G_STORE:
2053 return legalizeStore(Helper, MI);
2054 case TargetOpcode::G_FMAD:
2055 return legalizeFMad(MI, MRI, B);
2056 case TargetOpcode::G_FDIV:
2057 return legalizeFDIV(MI, MRI, B);
2058 case TargetOpcode::G_FFREXP:
2059 return legalizeFFREXP(MI, MRI, B);
2060 case TargetOpcode::G_FSQRT:
2061 return legalizeFSQRT(MI, MRI, B);
2062 case TargetOpcode::G_UDIV:
2063 case TargetOpcode::G_UREM:
2064 case TargetOpcode::G_UDIVREM:
2065 return legalizeUnsignedDIV_REM(MI, MRI, B);
2066 case TargetOpcode::G_SDIV:
2067 case TargetOpcode::G_SREM:
2068 case TargetOpcode::G_SDIVREM:
2069 return legalizeSignedDIV_REM(MI, MRI, B);
2070 case TargetOpcode::G_ATOMIC_CMPXCHG:
2071 return legalizeAtomicCmpXChg(MI, MRI, B);
2072 case TargetOpcode::G_FLOG2:
2073 return legalizeFlog2(MI, B);
2074 case TargetOpcode::G_FLOG:
2075 case TargetOpcode::G_FLOG10:
2076 return legalizeFlogCommon(MI, B);
2077 case TargetOpcode::G_FEXP2:
2078 return legalizeFExp2(MI, B);
2079 case TargetOpcode::G_FEXP:
2080 case TargetOpcode::G_FEXP10:
2081 return legalizeFExp(MI, B);
2082 case TargetOpcode::G_FPOW:
2083 return legalizeFPow(MI, B);
2084 case TargetOpcode::G_FFLOOR:
2085 return legalizeFFloor(MI, MRI, B);
2086 case TargetOpcode::G_BUILD_VECTOR:
2087 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2088 return legalizeBuildVector(MI, MRI, B);
2089 case TargetOpcode::G_MUL:
2090 return legalizeMul(Helper, MI);
2091 case TargetOpcode::G_CTLZ:
2092 case TargetOpcode::G_CTTZ:
2093 return legalizeCTLZ_CTTZ(MI, MRI, B);
2094 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2095 return legalizeFPTruncRound(MI, B);
2096 case TargetOpcode::G_STACKSAVE:
2097 return legalizeStackSave(MI, B);
2098 default:
2099 return false;
2100 }
2101
2102 llvm_unreachable("expected switch to return");
2103 }
2104
2105 Register AMDGPULegalizerInfo::getSegmentAperture(
2106 unsigned AS,
2107 MachineRegisterInfo &MRI,
2108 MachineIRBuilder &B) const {
2109 MachineFunction &MF = B.getMF();
2110 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2111 const LLT S32 = LLT::scalar(32);
2112 const LLT S64 = LLT::scalar(64);
2113
2114 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2115
2116 if (ST.hasApertureRegs()) {
2117 // Note: this register is somewhat broken. When used as a 32-bit operand,
2118 // it only returns zeroes. The real value is in the upper 32 bits.
2119 // Thus, we must extract the high 32 bits.
2120 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2121 ? AMDGPU::SRC_SHARED_BASE
2122 : AMDGPU::SRC_PRIVATE_BASE;
2123 // FIXME: It would be more natural to emit a COPY here, but then copy
2124 // coalescing would kick in and it would think it's okay to use the "HI"
2125 // subregister (instead of extracting the HI 32 bits) which is an artificial
2126 // (unusable) register.
2127 // Register TableGen definitions would need an overhaul to get rid of the
2128 // artificial "HI" aperture registers and prevent this kind of issue from
2129 // happening.
2130 Register Dst = MRI.createGenericVirtualRegister(S64);
2131 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2132 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2133 return B.buildUnmerge(S32, Dst).getReg(1);
2134 }
2135
2136 // TODO: can we be smarter about machine pointer info?
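// Without aperture registers, the aperture must be loaded from memory: with
// code object v5+ it is read from the implicit kernel arguments at a known
// offset, otherwise it is read from the queue descriptor (amd_queue_t) below.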
2137 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 2138 Register LoadAddr = MRI.createGenericVirtualRegister( 2139 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2140 // For code object version 5, private_base and shared_base are passed through 2141 // implicit kernargs. 2142 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= 2143 AMDGPU::AMDHSA_COV5) { 2144 AMDGPUTargetLowering::ImplicitParameter Param = 2145 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE 2146 : AMDGPUTargetLowering::PRIVATE_BASE; 2147 uint64_t Offset = 2148 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 2149 2150 Register KernargPtrReg = MRI.createGenericVirtualRegister( 2151 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2152 2153 if (!loadInputValue(KernargPtrReg, B, 2154 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 2155 return Register(); 2156 2157 MachineMemOperand *MMO = MF.getMachineMemOperand( 2158 PtrInfo, 2159 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2160 MachineMemOperand::MOInvariant, 2161 LLT::scalar(32), commonAlignment(Align(64), Offset)); 2162 2163 // Pointer address 2164 B.buildPtrAdd(LoadAddr, KernargPtrReg, 2165 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 2166 // Load address 2167 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2168 } 2169 2170 Register QueuePtr = MRI.createGenericVirtualRegister( 2171 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2172 2173 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 2174 return Register(); 2175 2176 // Offset into amd_queue_t for group_segment_aperture_base_hi / 2177 // private_segment_aperture_base_hi. 2178 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 2179 2180 MachineMemOperand *MMO = MF.getMachineMemOperand( 2181 PtrInfo, 2182 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2183 MachineMemOperand::MOInvariant, 2184 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 2185 2186 B.buildPtrAdd(LoadAddr, QueuePtr, 2187 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 2188 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2189 } 2190 2191 /// Return true if the value is a known valid address, such that a null check is 2192 /// not necessary. 2193 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 2194 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 2195 MachineInstr *Def = MRI.getVRegDef(Val); 2196 switch (Def->getOpcode()) { 2197 case AMDGPU::G_FRAME_INDEX: 2198 case AMDGPU::G_GLOBAL_VALUE: 2199 case AMDGPU::G_BLOCK_ADDR: 2200 return true; 2201 case AMDGPU::G_CONSTANT: { 2202 const ConstantInt *CI = Def->getOperand(1).getCImm(); 2203 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 2204 } 2205 default: 2206 return false; 2207 } 2208 2209 return false; 2210 } 2211 2212 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 2213 MachineInstr &MI, MachineRegisterInfo &MRI, 2214 MachineIRBuilder &B) const { 2215 MachineFunction &MF = B.getMF(); 2216 2217 const LLT S32 = LLT::scalar(32); 2218 Register Dst = MI.getOperand(0).getReg(); 2219 Register Src = MI.getOperand(1).getReg(); 2220 2221 LLT DstTy = MRI.getType(Dst); 2222 LLT SrcTy = MRI.getType(Src); 2223 unsigned DestAS = DstTy.getAddressSpace(); 2224 unsigned SrcAS = SrcTy.getAddressSpace(); 2225 2226 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 2227 // vector element. 
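// Summary of the non-trivial cases handled below: flat -> local/private keeps
// only the low 32 bits of the pointer, and local/private -> flat rebuilds the
// 64-bit pointer from the low 32 bits plus the segment aperture; both
// directions insert a null check unless the source is known to be non-null.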
2228 assert(!DstTy.isVector()); 2229 2230 const AMDGPUTargetMachine &TM 2231 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 2232 2233 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 2234 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 2235 return true; 2236 } 2237 2238 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 2239 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 2240 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2241 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 2242 // Extract low 32-bits of the pointer. 2243 B.buildExtract(Dst, Src, 0); 2244 MI.eraseFromParent(); 2245 return true; 2246 } 2247 2248 unsigned NullVal = TM.getNullPointerValue(DestAS); 2249 2250 auto SegmentNull = B.buildConstant(DstTy, NullVal); 2251 auto FlatNull = B.buildConstant(SrcTy, 0); 2252 2253 // Extract low 32-bits of the pointer. 2254 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 2255 2256 auto CmpRes = 2257 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 2258 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 2259 2260 MI.eraseFromParent(); 2261 return true; 2262 } 2263 2264 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 2265 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 2266 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2267 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 2268 if (!ApertureReg.isValid()) 2269 return false; 2270 2271 // Coerce the type of the low half of the result so we can use merge_values. 2272 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 2273 2274 // TODO: Should we allow mismatched types but matching sizes in merges to 2275 // avoid the ptrtoint? 2276 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); 2277 2278 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 2279 B.buildCopy(Dst, BuildPtr); 2280 MI.eraseFromParent(); 2281 return true; 2282 } 2283 2284 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 2285 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 2286 2287 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 2288 SegmentNull.getReg(0)); 2289 2290 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2291 2292 MI.eraseFromParent(); 2293 return true; 2294 } 2295 2296 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2297 SrcTy.getSizeInBits() == 64) { 2298 // Truncate. 
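// Only the low half of the 64-bit pointer is meaningful in the 32-bit
// constant address space, so the cast reduces to an extract of bits [31:0].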
2299 B.buildExtract(Dst, Src, 0); 2300 MI.eraseFromParent(); 2301 return true; 2302 } 2303 2304 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2305 DstTy.getSizeInBits() == 64) { 2306 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2307 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2308 auto PtrLo = B.buildPtrToInt(S32, Src); 2309 auto HighAddr = B.buildConstant(S32, AddrHiVal); 2310 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 2311 MI.eraseFromParent(); 2312 return true; 2313 } 2314 2315 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2316 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2317 2318 LLVMContext &Ctx = MF.getFunction().getContext(); 2319 Ctx.diagnose(InvalidAddrSpaceCast); 2320 B.buildUndef(Dst); 2321 MI.eraseFromParent(); 2322 return true; 2323 } 2324 2325 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI, 2326 MachineRegisterInfo &MRI, 2327 MachineIRBuilder &B) const { 2328 Register Src = MI.getOperand(1).getReg(); 2329 LLT Ty = MRI.getType(Src); 2330 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2331 2332 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2333 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2334 2335 auto C1 = B.buildFConstant(Ty, C1Val); 2336 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2337 2338 // TODO: Should this propagate fast-math-flags? 2339 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2340 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2341 2342 auto C2 = B.buildFConstant(Ty, C2Val); 2343 auto Fabs = B.buildFAbs(Ty, Src); 2344 2345 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2346 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2347 MI.eraseFromParent(); 2348 return true; 2349 } 2350 2351 bool AMDGPULegalizerInfo::legalizeFceil( 2352 MachineInstr &MI, MachineRegisterInfo &MRI, 2353 MachineIRBuilder &B) const { 2354 2355 const LLT S1 = LLT::scalar(1); 2356 const LLT S64 = LLT::scalar(64); 2357 2358 Register Src = MI.getOperand(1).getReg(); 2359 assert(MRI.getType(Src) == S64); 2360 2361 // result = trunc(src) 2362 // if (src > 0.0 && src != result) 2363 // result += 1.0 2364 2365 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2366 2367 const auto Zero = B.buildFConstant(S64, 0.0); 2368 const auto One = B.buildFConstant(S64, 1.0); 2369 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2370 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2371 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2372 auto Add = B.buildSelect(S64, And, One, Zero); 2373 2374 // TODO: Should this propagate fast-math-flags? 
2375 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2376 MI.eraseFromParent(); 2377 return true; 2378 } 2379 2380 bool AMDGPULegalizerInfo::legalizeFrem( 2381 MachineInstr &MI, MachineRegisterInfo &MRI, 2382 MachineIRBuilder &B) const { 2383 Register DstReg = MI.getOperand(0).getReg(); 2384 Register Src0Reg = MI.getOperand(1).getReg(); 2385 Register Src1Reg = MI.getOperand(2).getReg(); 2386 auto Flags = MI.getFlags(); 2387 LLT Ty = MRI.getType(DstReg); 2388 2389 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2390 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2391 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2392 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2393 MI.eraseFromParent(); 2394 return true; 2395 } 2396 2397 static MachineInstrBuilder extractF64Exponent(Register Hi, 2398 MachineIRBuilder &B) { 2399 const unsigned FractBits = 52; 2400 const unsigned ExpBits = 11; 2401 LLT S32 = LLT::scalar(32); 2402 2403 auto Const0 = B.buildConstant(S32, FractBits - 32); 2404 auto Const1 = B.buildConstant(S32, ExpBits); 2405 2406 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}) 2407 .addUse(Hi) 2408 .addUse(Const0.getReg(0)) 2409 .addUse(Const1.getReg(0)); 2410 2411 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2412 } 2413 2414 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2415 MachineInstr &MI, MachineRegisterInfo &MRI, 2416 MachineIRBuilder &B) const { 2417 const LLT S1 = LLT::scalar(1); 2418 const LLT S32 = LLT::scalar(32); 2419 const LLT S64 = LLT::scalar(64); 2420 2421 Register Src = MI.getOperand(1).getReg(); 2422 assert(MRI.getType(Src) == S64); 2423 2424 // TODO: Should this use extract since the low half is unused? 2425 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2426 Register Hi = Unmerge.getReg(1); 2427 2428 // Extract the upper half, since this is where we will find the sign and 2429 // exponent. 2430 auto Exp = extractF64Exponent(Hi, B); 2431 2432 const unsigned FractBits = 52; 2433 2434 // Extract the sign bit. 2435 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2436 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2437 2438 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2439 2440 const auto Zero32 = B.buildConstant(S32, 0); 2441 2442 // Extend back to 64-bits. 2443 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 2444 2445 auto Shr = B.buildAShr(S64, FractMask, Exp); 2446 auto Not = B.buildNot(S64, Shr); 2447 auto Tmp0 = B.buildAnd(S64, Src, Not); 2448 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2449 2450 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2451 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2452 2453 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2454 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2455 MI.eraseFromParent(); 2456 return true; 2457 } 2458 2459 bool AMDGPULegalizerInfo::legalizeITOFP( 2460 MachineInstr &MI, MachineRegisterInfo &MRI, 2461 MachineIRBuilder &B, bool Signed) const { 2462 2463 Register Dst = MI.getOperand(0).getReg(); 2464 Register Src = MI.getOperand(1).getReg(); 2465 2466 const LLT S64 = LLT::scalar(64); 2467 const LLT S32 = LLT::scalar(32); 2468 2469 assert(MRI.getType(Src) == S64); 2470 2471 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2472 auto ThirtyTwo = B.buildConstant(S32, 32); 2473 2474 if (MRI.getType(Dst) == S64) { 2475 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2476 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2477 2478 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2479 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo); 2480 2481 // TODO: Should this propagate fast-math-flags? 2482 B.buildFAdd(Dst, LdExp, CvtLo); 2483 MI.eraseFromParent(); 2484 return true; 2485 } 2486 2487 assert(MRI.getType(Dst) == S32); 2488 2489 auto One = B.buildConstant(S32, 1); 2490 2491 MachineInstrBuilder ShAmt; 2492 if (Signed) { 2493 auto ThirtyOne = B.buildConstant(S32, 31); 2494 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2495 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2496 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2497 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}) 2498 .addUse(Unmerge.getReg(1)); 2499 auto LS2 = B.buildSub(S32, LS, One); 2500 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2501 } else 2502 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2503 auto Norm = B.buildShl(S64, Src, ShAmt); 2504 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2505 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2506 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2507 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2508 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2509 B.buildFLdexp(Dst, FVal, Scale); 2510 MI.eraseFromParent(); 2511 return true; 2512 } 2513 2514 // TODO: Copied from DAG implementation. Verify logic and document how this 2515 // actually works. 2516 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2517 MachineRegisterInfo &MRI, 2518 MachineIRBuilder &B, 2519 bool Signed) const { 2520 2521 Register Dst = MI.getOperand(0).getReg(); 2522 Register Src = MI.getOperand(1).getReg(); 2523 2524 const LLT S64 = LLT::scalar(64); 2525 const LLT S32 = LLT::scalar(32); 2526 2527 const LLT SrcLT = MRI.getType(Src); 2528 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2529 2530 unsigned Flags = MI.getFlags(); 2531 2532 // The basic idea of converting a floating point number into a pair of 32-bit 2533 // integers is illustrated as follows: 2534 // 2535 // tf := trunc(val); 2536 // hif := floor(tf * 2^-32); 2537 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2538 // hi := fptoi(hif); 2539 // lo := fptoi(lof); 2540 // 2541 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2542 MachineInstrBuilder Sign; 2543 if (Signed && SrcLT == S32) { 2544 // However, a 32-bit floating point number has only 23 bits mantissa and 2545 // it's not enough to hold all the significant bits of `lof` if val is 2546 // negative. To avoid the loss of precision, We need to take the absolute 2547 // value after truncating and flip the result back based on the original 2548 // signedness. 
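// The sign below is an all-zeros/all-ones mask (arithmetic shift of the
// original bits); the final (r ^ sign) - sign conditionally negates the
// merged 64-bit result to restore the original sign.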
2549 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2550 Trunc = B.buildFAbs(S32, Trunc, Flags);
2551 }
2552 MachineInstrBuilder K0, K1;
2553 if (SrcLT == S64) {
2554 K0 = B.buildFConstant(
2555 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2556 K1 = B.buildFConstant(
2557 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2558 } else {
2559 K0 = B.buildFConstant(
2560 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2561 K1 = B.buildFConstant(
2562 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2563 }
2564
2565 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2566 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2567 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2568
2569 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2570 : B.buildFPTOUI(S32, FloorMul);
2571 auto Lo = B.buildFPTOUI(S32, Fma);
2572
2573 if (Signed && SrcLT == S32) {
2574 // Flip the result based on the signedness, which is either all 0s or 1s.
2575 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2576 // r := xor({lo, hi}, sign) - sign;
2577 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2578 Sign);
2579 } else
2580 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2581 MI.eraseFromParent();
2582
2583 return true;
2584 }
2585
2586 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2587 MachineInstr &MI) const {
2588 MachineFunction &MF = Helper.MIRBuilder.getMF();
2589 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2590
2591 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2592 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2593
2594 // With ieee_mode disabled, the instructions have the correct behavior
2595 // already for G_FMINNUM/G_FMAXNUM
2596 if (!MFI->getMode().IEEE)
2597 return !IsIEEEOp;
2598
2599 if (IsIEEEOp)
2600 return true;
2601
2602 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2603 }
2604
2605 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2606 MachineInstr &MI, MachineRegisterInfo &MRI,
2607 MachineIRBuilder &B) const {
2608 // TODO: Should move some of this into LegalizerHelper.
2609
2610 // TODO: Promote dynamic indexing of s16 to s32
2611
2612 Register Dst = MI.getOperand(0).getReg();
2613 Register Vec = MI.getOperand(1).getReg();
2614
2615 LLT VecTy = MRI.getType(Vec);
2616 LLT EltTy = VecTy.getElementType();
2617 assert(EltTy == MRI.getType(Dst));
2618
2619 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2620 // but we can't go directly to that logic because you can't bitcast a vector
2621 // of pointers to a vector of integers. Therefore, introduce an intermediate
2622 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2623 // drive the legalization forward.
2624 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2625 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2626 LLT IntVecTy = VecTy.changeElementType(IntTy);
2627
2628 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2629 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2630 B.buildIntToPtr(Dst, IntElt);
2631
2632 MI.eraseFromParent();
2633 return true;
2634 }
2635
2636 // FIXME: Artifact combiner probably should have replaced the truncated
2637 // constant before this, so we shouldn't need
2638 // getIConstantVRegValWithLookThrough.
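// A known-constant index is folded into an unmerge plus a copy of the selected
// element, and an out-of-bounds constant index becomes undef; dynamic indices
// are left in place for selection to handle with register indexing.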
2639 std::optional<ValueAndVReg> MaybeIdxVal =
2640 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2641 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2642 return true;
2643 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2644
2645 if (IdxVal < VecTy.getNumElements()) {
2646 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2647 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2648 } else {
2649 B.buildUndef(Dst);
2650 }
2651
2652 MI.eraseFromParent();
2653 return true;
2654 }
2655
2656 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2657 MachineInstr &MI, MachineRegisterInfo &MRI,
2658 MachineIRBuilder &B) const {
2659 // TODO: Should move some of this into LegalizerHelper.
2660
2661 // TODO: Promote dynamic indexing of s16 to s32
2662
2663 Register Dst = MI.getOperand(0).getReg();
2664 Register Vec = MI.getOperand(1).getReg();
2665 Register Ins = MI.getOperand(2).getReg();
2666
2667 LLT VecTy = MRI.getType(Vec);
2668 LLT EltTy = VecTy.getElementType();
2669 assert(EltTy == MRI.getType(Ins));
2670
2671 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2672 // but we can't go directly to that logic because you can't bitcast a vector
2673 // of pointers to a vector of integers. Therefore, make the pointer vector
2674 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2675 // new value, and then inttoptr the result vector back. This will then allow
2676 // the rest of legalization to take over.
2677 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2678 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2679 LLT IntVecTy = VecTy.changeElementType(IntTy);
2680
2681 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2682 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2683 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2684 MI.getOperand(3));
2685 B.buildIntToPtr(Dst, IntVecDest);
2686 MI.eraseFromParent();
2687 return true;
2688 }
2689
2690 // FIXME: Artifact combiner probably should have replaced the truncated
2691 // constant before this, so we shouldn't need
2692 // getIConstantVRegValWithLookThrough.
2693 std::optional<ValueAndVReg> MaybeIdxVal =
2694 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2695 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2696 return true; 2697 2698 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2699 2700 unsigned NumElts = VecTy.getNumElements(); 2701 if (IdxVal < NumElts) { 2702 SmallVector<Register, 8> SrcRegs; 2703 for (unsigned i = 0; i < NumElts; ++i) 2704 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2705 B.buildUnmerge(SrcRegs, Vec); 2706 2707 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2708 B.buildMergeLikeInstr(Dst, SrcRegs); 2709 } else { 2710 B.buildUndef(Dst); 2711 } 2712 2713 MI.eraseFromParent(); 2714 return true; 2715 } 2716 2717 bool AMDGPULegalizerInfo::legalizeSinCos( 2718 MachineInstr &MI, MachineRegisterInfo &MRI, 2719 MachineIRBuilder &B) const { 2720 2721 Register DstReg = MI.getOperand(0).getReg(); 2722 Register SrcReg = MI.getOperand(1).getReg(); 2723 LLT Ty = MRI.getType(DstReg); 2724 unsigned Flags = MI.getFlags(); 2725 2726 Register TrigVal; 2727 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2728 if (ST.hasTrigReducedRange()) { 2729 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2730 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}) 2731 .addUse(MulVal.getReg(0)) 2732 .setMIFlags(Flags) 2733 .getReg(0); 2734 } else 2735 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2736 2737 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2738 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2739 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg)) 2740 .addUse(TrigVal) 2741 .setMIFlags(Flags); 2742 MI.eraseFromParent(); 2743 return true; 2744 } 2745 2746 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2747 MachineIRBuilder &B, 2748 const GlobalValue *GV, 2749 int64_t Offset, 2750 unsigned GAFlags) const { 2751 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2752 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2753 // to the following code sequence: 2754 // 2755 // For constant address space: 2756 // s_getpc_b64 s[0:1] 2757 // s_add_u32 s0, s0, $symbol 2758 // s_addc_u32 s1, s1, 0 2759 // 2760 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2761 // a fixup or relocation is emitted to replace $symbol with a literal 2762 // constant, which is a pc-relative offset from the encoding of the $symbol 2763 // operand to the global variable. 2764 // 2765 // For global address space: 2766 // s_getpc_b64 s[0:1] 2767 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2768 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2769 // 2770 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2771 // fixups or relocations are emitted to replace $symbol@*@lo and 2772 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2773 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2774 // operand to the global variable. 2775 2776 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2777 2778 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2779 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2780 2781 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2782 .addDef(PCReg); 2783 2784 MIB.addGlobalAddress(GV, Offset, GAFlags); 2785 if (GAFlags == SIInstrInfo::MO_NONE) 2786 MIB.addImm(0); 2787 else 2788 MIB.addGlobalAddress(GV, Offset, GAFlags + 1); 2789 2790 if (!B.getMRI()->getRegClassOrNull(PCReg)) 2791 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2792 2793 if (PtrTy.getSizeInBits() == 32) 2794 B.buildExtract(DstReg, PCReg, 0); 2795 return true; 2796 } 2797 2798 // Emit a ABS32_LO / ABS32_HI relocation stub. 2799 void AMDGPULegalizerInfo::buildAbsGlobalAddress( 2800 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, 2801 MachineRegisterInfo &MRI) const { 2802 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32; 2803 2804 LLT S32 = LLT::scalar(32); 2805 2806 // Use the destination directly, if and only if we store the lower address 2807 // part only and we don't have a register class being set. 2808 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg) 2809 ? DstReg 2810 : MRI.createGenericVirtualRegister(S32); 2811 2812 if (!MRI.getRegClassOrNull(AddrLo)) 2813 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass); 2814 2815 // Write the lower half. 2816 B.buildInstr(AMDGPU::S_MOV_B32) 2817 .addDef(AddrLo) 2818 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); 2819 2820 // If required, write the upper half as well. 2821 if (RequiresHighHalf) { 2822 assert(PtrTy.getSizeInBits() == 64 && 2823 "Must provide a 64-bit pointer type!"); 2824 2825 Register AddrHi = MRI.createGenericVirtualRegister(S32); 2826 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass); 2827 2828 B.buildInstr(AMDGPU::S_MOV_B32) 2829 .addDef(AddrHi) 2830 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI); 2831 2832 // Use the destination directly, if and only if we don't have a register 2833 // class being set. 2834 Register AddrDst = !MRI.getRegClassOrNull(DstReg) 2835 ? DstReg 2836 : MRI.createGenericVirtualRegister(LLT::scalar(64)); 2837 2838 if (!MRI.getRegClassOrNull(AddrDst)) 2839 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass); 2840 2841 B.buildMergeValues(AddrDst, {AddrLo, AddrHi}); 2842 2843 // If we created a new register for the destination, cast the result into 2844 // the final output. 2845 if (AddrDst != DstReg) 2846 B.buildCast(DstReg, AddrDst); 2847 } else if (AddrLo != DstReg) { 2848 // If we created a new register for the destination, cast the result into 2849 // the final output. 
2850 B.buildCast(DstReg, AddrLo);
2851 }
2852 }
2853
2854 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2855 MachineInstr &MI, MachineRegisterInfo &MRI,
2856 MachineIRBuilder &B) const {
2857 Register DstReg = MI.getOperand(0).getReg();
2858 LLT Ty = MRI.getType(DstReg);
2859 unsigned AS = Ty.getAddressSpace();
2860
2861 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2862 MachineFunction &MF = B.getMF();
2863 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2864
2865 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2866 if (!MFI->isModuleEntryFunction() &&
2867 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2868 const Function &Fn = MF.getFunction();
2869 DiagnosticInfoUnsupported BadLDSDecl(
2870 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2871 DS_Warning);
2872 Fn.getContext().diagnose(BadLDSDecl);
2873
2874 // We currently don't have a way to correctly allocate LDS objects that
2875 // aren't directly associated with a kernel. We do force inlining of
2876 // functions that use local objects. However, if these dead functions are
2877 // not eliminated, we don't want a compile time error. Just emit a warning
2878 // and a trap, since there should be no callable path here.
2879 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
2880 B.buildUndef(DstReg);
2881 MI.eraseFromParent();
2882 return true;
2883 }
2884
2885 // TODO: We could emit code to handle the initialization somewhere.
2886 // We ignore the initializer for now and legalize it to allow selection.
2887 // The initializer will get errored out during assembly emission anyway.
2888 const SITargetLowering *TLI = ST.getTargetLowering();
2889 if (!TLI->shouldUseLDSConstAddress(GV)) {
2890 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2891 return true; // Leave in place.
2892 }
2893
2894 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2895 Type *Ty = GV->getValueType();
2896 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2897 // zero-sized type in other languages to declare dynamic shared
2898 // memory whose size is not known at compile time. Such arrays are
2899 // allocated by the runtime and placed directly after the statically
2900 // allocated ones. They all share the same offset.
2901 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2902 // Adjust alignment for that dynamic shared memory array.
2903 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); 2904 LLT S32 = LLT::scalar(32); 2905 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}); 2906 B.buildIntToPtr(DstReg, Sz); 2907 MI.eraseFromParent(); 2908 return true; 2909 } 2910 } 2911 2912 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2913 *cast<GlobalVariable>(GV))); 2914 MI.eraseFromParent(); 2915 return true; 2916 } 2917 2918 if (ST.isAmdPalOS() || ST.isMesa3DOS()) { 2919 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI); 2920 MI.eraseFromParent(); 2921 return true; 2922 } 2923 2924 const SITargetLowering *TLI = ST.getTargetLowering(); 2925 2926 if (TLI->shouldEmitFixup(GV)) { 2927 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2928 MI.eraseFromParent(); 2929 return true; 2930 } 2931 2932 if (TLI->shouldEmitPCReloc(GV)) { 2933 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2934 MI.eraseFromParent(); 2935 return true; 2936 } 2937 2938 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2939 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2940 2941 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2942 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2943 MachinePointerInfo::getGOT(MF), 2944 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2945 MachineMemOperand::MOInvariant, 2946 LoadTy, Align(8)); 2947 2948 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2949 2950 if (Ty.getSizeInBits() == 32) { 2951 // Truncate if this is a 32-bit constant address. 2952 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2953 B.buildExtract(DstReg, Load, 0); 2954 } else 2955 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2956 2957 MI.eraseFromParent(); 2958 return true; 2959 } 2960 2961 static LLT widenToNextPowerOf2(LLT Ty) { 2962 if (Ty.isVector()) 2963 return Ty.changeElementCount( 2964 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2965 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2966 } 2967 2968 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2969 MachineInstr &MI) const { 2970 MachineIRBuilder &B = Helper.MIRBuilder; 2971 MachineRegisterInfo &MRI = *B.getMRI(); 2972 GISelChangeObserver &Observer = Helper.Observer; 2973 2974 Register PtrReg = MI.getOperand(1).getReg(); 2975 LLT PtrTy = MRI.getType(PtrReg); 2976 unsigned AddrSpace = PtrTy.getAddressSpace(); 2977 2978 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2979 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2980 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2981 Observer.changingInstr(MI); 2982 MI.getOperand(1).setReg(Cast.getReg(0)); 2983 Observer.changedInstr(MI); 2984 return true; 2985 } 2986 2987 if (MI.getOpcode() != AMDGPU::G_LOAD) 2988 return false; 2989 2990 Register ValReg = MI.getOperand(0).getReg(); 2991 LLT ValTy = MRI.getType(ValReg); 2992 2993 if (hasBufferRsrcWorkaround(ValTy)) { 2994 Observer.changingInstr(MI); 2995 castBufferRsrcFromV4I32(MI, B, MRI, 0); 2996 Observer.changedInstr(MI); 2997 return true; 2998 } 2999 3000 MachineMemOperand *MMO = *MI.memoperands_begin(); 3001 const unsigned ValSize = ValTy.getSizeInBits(); 3002 const LLT MemTy = MMO->getMemoryType(); 3003 const Align MemAlign = MMO->getAlign(); 3004 const unsigned MemSize = MemTy.getSizeInBits(); 3005 const uint64_t AlignInBits = 8 * MemAlign.value(); 3006 3007 // Widen non-power-of-2 loads to the alignment if needed 3008 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 3009 const 
unsigned WideMemSize = PowerOf2Ceil(MemSize); 3010 3011 // This was already the correct extending load result type, so just adjust 3012 // the memory type. 3013 if (WideMemSize == ValSize) { 3014 MachineFunction &MF = B.getMF(); 3015 3016 MachineMemOperand *WideMMO = 3017 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 3018 Observer.changingInstr(MI); 3019 MI.setMemRefs(MF, {WideMMO}); 3020 Observer.changedInstr(MI); 3021 return true; 3022 } 3023 3024 // Don't bother handling edge case that should probably never be produced. 3025 if (ValSize > WideMemSize) 3026 return false; 3027 3028 LLT WideTy = widenToNextPowerOf2(ValTy); 3029 3030 Register WideLoad; 3031 if (!WideTy.isVector()) { 3032 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3033 B.buildTrunc(ValReg, WideLoad).getReg(0); 3034 } else { 3035 // Extract the subvector. 3036 3037 if (isRegisterType(ValTy)) { 3038 // If this a case where G_EXTRACT is legal, use it. 3039 // (e.g. <3 x s32> -> <4 x s32>) 3040 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3041 B.buildExtract(ValReg, WideLoad, 0); 3042 } else { 3043 // For cases where the widened type isn't a nice register value, unmerge 3044 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 3045 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3046 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 3047 } 3048 } 3049 3050 MI.eraseFromParent(); 3051 return true; 3052 } 3053 3054 return false; 3055 } 3056 3057 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper, 3058 MachineInstr &MI) const { 3059 MachineIRBuilder &B = Helper.MIRBuilder; 3060 MachineRegisterInfo &MRI = *B.getMRI(); 3061 GISelChangeObserver &Observer = Helper.Observer; 3062 3063 Register DataReg = MI.getOperand(0).getReg(); 3064 LLT DataTy = MRI.getType(DataReg); 3065 3066 if (hasBufferRsrcWorkaround(DataTy)) { 3067 Observer.changingInstr(MI); 3068 castBufferRsrcArgToV4I32(MI, B, 0); 3069 Observer.changedInstr(MI); 3070 return true; 3071 } 3072 return false; 3073 } 3074 3075 bool AMDGPULegalizerInfo::legalizeFMad( 3076 MachineInstr &MI, MachineRegisterInfo &MRI, 3077 MachineIRBuilder &B) const { 3078 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3079 assert(Ty.isScalar()); 3080 3081 MachineFunction &MF = B.getMF(); 3082 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3083 3084 // TODO: Always legal with future ftz flag. 3085 // FIXME: Do we need just output? 
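// G_FMAD is only kept if denormals are already flushed (preserve-sign) for
// this type, since the underlying mad/mac instructions flush denormals;
// otherwise fall back to the generic fmul + fadd lowering.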
3086 if (Ty == LLT::float32() && 3087 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()) 3088 return true; 3089 if (Ty == LLT::float16() && 3090 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()) 3091 return true; 3092 3093 MachineIRBuilder HelperBuilder(MI); 3094 GISelObserverWrapper DummyObserver; 3095 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 3096 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 3097 } 3098 3099 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 3100 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3101 Register DstReg = MI.getOperand(0).getReg(); 3102 Register PtrReg = MI.getOperand(1).getReg(); 3103 Register CmpVal = MI.getOperand(2).getReg(); 3104 Register NewVal = MI.getOperand(3).getReg(); 3105 3106 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 3107 "this should not have been custom lowered"); 3108 3109 LLT ValTy = MRI.getType(CmpVal); 3110 LLT VecTy = LLT::fixed_vector(2, ValTy); 3111 3112 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 3113 3114 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 3115 .addDef(DstReg) 3116 .addUse(PtrReg) 3117 .addUse(PackedVal) 3118 .setMemRefs(MI.memoperands()); 3119 3120 MI.eraseFromParent(); 3121 return true; 3122 } 3123 3124 /// Return true if it's known that \p Src can never be an f32 denormal value. 3125 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, 3126 Register Src) { 3127 const MachineInstr *DefMI = MRI.getVRegDef(Src); 3128 switch (DefMI->getOpcode()) { 3129 case TargetOpcode::G_INTRINSIC: { 3130 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) { 3131 case Intrinsic::amdgcn_frexp_mant: 3132 return true; 3133 default: 3134 break; 3135 } 3136 3137 break; 3138 } 3139 case TargetOpcode::G_FFREXP: { 3140 if (DefMI->getOperand(0).getReg() == Src) 3141 return true; 3142 break; 3143 } 3144 case TargetOpcode::G_FPEXT: { 3145 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16); 3146 } 3147 default: 3148 return false; 3149 } 3150 3151 return false; 3152 } 3153 3154 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { 3155 if (Flags & MachineInstr::FmAfn) 3156 return true; 3157 const auto &Options = MF.getTarget().Options; 3158 return Options.UnsafeFPMath || Options.ApproxFuncFPMath; 3159 } 3160 3161 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, 3162 unsigned Flags) { 3163 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) && 3164 MF.getDenormalMode(APFloat::IEEEsingle()).Input != 3165 DenormalMode::PreserveSign; 3166 } 3167 3168 std::pair<Register, Register> 3169 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src, 3170 unsigned Flags) const { 3171 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) 3172 return {}; 3173 3174 const LLT F32 = LLT::scalar(32); 3175 auto SmallestNormal = B.buildFConstant( 3176 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle())); 3177 auto IsLtSmallestNormal = 3178 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal); 3179 3180 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32); 3181 auto One = B.buildFConstant(F32, 1.0); 3182 auto ScaleFactor = 3183 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags); 3184 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags); 3185 3186 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)}; 3187 } 3188 3189 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI, 3190 
MachineIRBuilder &B) const { 3191 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. 3192 // If we have to handle denormals, scale up the input and adjust the result. 3193 3194 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) 3195 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0) 3196 3197 Register Dst = MI.getOperand(0).getReg(); 3198 Register Src = MI.getOperand(1).getReg(); 3199 LLT Ty = B.getMRI()->getType(Dst); 3200 unsigned Flags = MI.getFlags(); 3201 3202 if (Ty == LLT::scalar(16)) { 3203 const LLT F32 = LLT::scalar(32); 3204 // Nothing in half is a denormal when promoted to f32. 3205 auto Ext = B.buildFPExt(F32, Src, Flags); 3206 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}) 3207 .addUse(Ext.getReg(0)) 3208 .setMIFlags(Flags); 3209 B.buildFPTrunc(Dst, Log2, Flags); 3210 MI.eraseFromParent(); 3211 return true; 3212 } 3213 3214 assert(Ty == LLT::scalar(32)); 3215 3216 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags); 3217 if (!ScaledInput) { 3218 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}) 3219 .addUse(Src) 3220 .setMIFlags(Flags); 3221 MI.eraseFromParent(); 3222 return true; 3223 } 3224 3225 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3226 .addUse(ScaledInput) 3227 .setMIFlags(Flags); 3228 3229 auto ThirtyTwo = B.buildFConstant(Ty, 32.0); 3230 auto Zero = B.buildFConstant(Ty, 0.0); 3231 auto ResultOffset = 3232 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags); 3233 B.buildFSub(Dst, Log2, ResultOffset, Flags); 3234 3235 MI.eraseFromParent(); 3236 return true; 3237 } 3238 3239 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y, 3240 Register Z, unsigned Flags) { 3241 auto FMul = B.buildFMul(Ty, X, Y, Flags); 3242 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0); 3243 } 3244 3245 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, 3246 MachineIRBuilder &B) const { 3247 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10; 3248 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG); 3249 3250 MachineRegisterInfo &MRI = *B.getMRI(); 3251 Register Dst = MI.getOperand(0).getReg(); 3252 Register X = MI.getOperand(1).getReg(); 3253 unsigned Flags = MI.getFlags(); 3254 const LLT Ty = MRI.getType(X); 3255 MachineFunction &MF = B.getMF(); 3256 3257 const LLT F32 = LLT::scalar(32); 3258 const LLT F16 = LLT::scalar(16); 3259 3260 const AMDGPUTargetMachine &TM = 3261 static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 3262 3263 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || 3264 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { 3265 if (Ty == F16 && !ST.has16BitInsts()) { 3266 Register LogVal = MRI.createGenericVirtualRegister(F32); 3267 auto PromoteSrc = B.buildFPExt(F32, X); 3268 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags); 3269 B.buildFPTrunc(Dst, LogVal); 3270 } else { 3271 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags); 3272 } 3273 3274 MI.eraseFromParent(); 3275 return true; 3276 } 3277 3278 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags); 3279 if (ScaledInput) 3280 X = ScaledInput; 3281 3282 auto Y = 3283 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags); 3284 3285 Register R; 3286 if (ST.hasFastFMAF32()) { 3287 // c+cc are ln(2)/ln(10) to more than 49 bits 3288 const float c_log10 = 0x1.344134p-2f; 3289 const float cc_log10 = 0x1.09f79ep-26f; 3290 3291 // c + cc is ln(2) to more than 49 bits 3292 const float c_log = 0x1.62e42ep-1f; 3293 const float cc_log = 
0x1.efa39ep-25f; 3294 3295 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); 3296 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log); 3297 3298 R = B.buildFMul(Ty, Y, C, Flags).getReg(0); 3299 auto NegR = B.buildFNeg(Ty, R, Flags); 3300 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); 3301 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); 3302 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); 3303 } else { 3304 // ch+ct is ln(2)/ln(10) to more than 36 bits 3305 const float ch_log10 = 0x1.344000p-2f; 3306 const float ct_log10 = 0x1.3509f6p-18f; 3307 3308 // ch + ct is ln(2) to more than 36 bits 3309 const float ch_log = 0x1.62e000p-1f; 3310 const float ct_log = 0x1.0bfbe8p-15f; 3311 3312 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log); 3313 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log); 3314 3315 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3316 auto YH = B.buildAnd(Ty, Y, MaskConst); 3317 auto YT = B.buildFSub(Ty, Y, YH, Flags); 3318 auto YTCT = B.buildFMul(Ty, YT, CT, Flags); 3319 3320 Register Mad0 = 3321 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); 3322 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); 3323 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); 3324 } 3325 3326 const bool IsFiniteOnly = 3327 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && 3328 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); 3329 3330 if (!IsFiniteOnly) { 3331 // Expand isfinite(x) => fabs(x) < inf 3332 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3333 auto Fabs = B.buildFAbs(Ty, Y); 3334 auto IsFinite = 3335 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 3336 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0); 3337 } 3338 3339 if (ScaledInput) { 3340 auto Zero = B.buildFConstant(Ty, 0.0); 3341 auto ShiftK = 3342 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f); 3343 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags); 3344 B.buildFSub(Dst, R, Shift, Flags); 3345 } else { 3346 B.buildCopy(Dst, R); 3347 } 3348 3349 MI.eraseFromParent(); 3350 return true; 3351 } 3352 3353 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, 3354 Register Src, bool IsLog10, 3355 unsigned Flags) const { 3356 const double Log2BaseInverted = 3357 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; 3358 3359 LLT Ty = B.getMRI()->getType(Dst); 3360 3361 if (Ty == LLT::scalar(32)) { 3362 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags); 3363 if (ScaledInput) { 3364 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3365 .addUse(Src) 3366 .setMIFlags(Flags); 3367 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted); 3368 auto Zero = B.buildFConstant(Ty, 0.0); 3369 auto ResultOffset = 3370 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags); 3371 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted); 3372 3373 if (ST.hasFastFMAF32()) 3374 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags); 3375 else { 3376 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags); 3377 B.buildFAdd(Dst, Mul, ResultOffset, Flags); 3378 } 3379 3380 return true; 3381 } 3382 } 3383 3384 auto Log2Operand = Ty == LLT::scalar(16) 3385 ? 
B.buildFLog2(Ty, Src, Flags) 3386 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3387 .addUse(Src) 3388 .setMIFlags(Flags); 3389 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 3390 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 3391 return true; 3392 } 3393 3394 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, 3395 MachineIRBuilder &B) const { 3396 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. 3397 // If we have to handle denormals, scale up the input and adjust the result. 3398 3399 Register Dst = MI.getOperand(0).getReg(); 3400 Register Src = MI.getOperand(1).getReg(); 3401 unsigned Flags = MI.getFlags(); 3402 LLT Ty = B.getMRI()->getType(Dst); 3403 const LLT F16 = LLT::scalar(16); 3404 const LLT F32 = LLT::scalar(32); 3405 3406 if (Ty == F16) { 3407 // Nothing in half is a denormal when promoted to f32. 3408 auto Ext = B.buildFPExt(F32, Src, Flags); 3409 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}) 3410 .addUse(Ext.getReg(0)) 3411 .setMIFlags(Flags); 3412 B.buildFPTrunc(Dst, Log2, Flags); 3413 MI.eraseFromParent(); 3414 return true; 3415 } 3416 3417 assert(Ty == F32); 3418 3419 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) { 3420 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) 3421 .addUse(Src) 3422 .setMIFlags(Flags); 3423 MI.eraseFromParent(); 3424 return true; 3425 } 3426 3427 // bool needs_scaling = x < -0x1.f80000p+6f; 3428 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); 3429 3430 // -nextafter(128.0, -1) 3431 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f); 3432 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, 3433 RangeCheckConst, Flags); 3434 3435 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f); 3436 auto Zero = B.buildFConstant(Ty, 0.0); 3437 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags); 3438 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags); 3439 3440 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3441 .addUse(AddInput.getReg(0)) 3442 .setMIFlags(Flags); 3443 3444 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f); 3445 auto One = B.buildFConstant(Ty, 1.0); 3446 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags); 3447 B.buildFMul(Dst, Exp2, ResultScale, Flags); 3448 MI.eraseFromParent(); 3449 return true; 3450 } 3451 3452 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, 3453 Register X, unsigned Flags) const { 3454 LLT Ty = B.getMRI()->getType(Dst); 3455 LLT F32 = LLT::scalar(32); 3456 3457 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { 3458 auto Log2E = B.buildFConstant(Ty, numbers::log2e); 3459 auto Mul = B.buildFMul(Ty, X, Log2E, Flags); 3460 3461 if (Ty == F32) { 3462 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) 3463 .addUse(Mul.getReg(0)) 3464 .setMIFlags(Flags); 3465 } else { 3466 B.buildFExp2(Dst, Mul.getReg(0), Flags); 3467 } 3468 3469 return true; 3470 } 3471 3472 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f); 3473 auto NeedsScaling = 3474 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags); 3475 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f); 3476 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags); 3477 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags); 3478 3479 auto Log2E = B.buildFConstant(Ty, numbers::log2e); 3480 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags); 3481 3482 auto Exp2 = 
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3483 .addUse(ExpInput.getReg(0)) 3484 .setMIFlags(Flags); 3485 3486 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f); 3487 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags); 3488 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags); 3489 return true; 3490 } 3491 3492 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 3493 MachineIRBuilder &B) const { 3494 Register Dst = MI.getOperand(0).getReg(); 3495 Register X = MI.getOperand(1).getReg(); 3496 const unsigned Flags = MI.getFlags(); 3497 MachineFunction &MF = B.getMF(); 3498 MachineRegisterInfo &MRI = *B.getMRI(); 3499 LLT Ty = MRI.getType(Dst); 3500 const LLT F16 = LLT::scalar(16); 3501 const LLT F32 = LLT::scalar(32); 3502 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10; 3503 3504 if (Ty == F16) { 3505 // v_exp_f16 (fmul x, log2e) 3506 if (allowApproxFunc(MF, Flags)) { 3507 // TODO: Does this really require fast? 3508 legalizeFExpUnsafe(B, Dst, X, Flags); 3509 MI.eraseFromParent(); 3510 return true; 3511 } 3512 3513 // exp(f16 x) -> 3514 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 3515 3516 // Nothing in half is a denormal when promoted to f32. 3517 auto Ext = B.buildFPExt(F32, X, Flags); 3518 Register Lowered = MRI.createGenericVirtualRegister(F32); 3519 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); 3520 B.buildFPTrunc(Dst, Lowered, Flags); 3521 MI.eraseFromParent(); 3522 return true; 3523 } 3524 3525 assert(Ty == F32); 3526 3527 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 3528 // library behavior. Also, is known-not-daz source sufficient? 3529 if (allowApproxFunc(MF, Flags)) { 3530 legalizeFExpUnsafe(B, Dst, X, Flags); 3531 MI.eraseFromParent(); 3532 return true; 3533 } 3534 3535 // Algorithm: 3536 // 3537 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 3538 // 3539 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 3540 // n = 64*m + j, 0 <= j < 64 3541 // 3542 // e^x = 2^((64*m + j + f)/64) 3543 // = (2^m) * (2^(j/64)) * 2^(f/64) 3544 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 3545 // 3546 // f = x*(64/ln(2)) - n 3547 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 3548 // 3549 // e^x = (2^m) * (2^(j/64)) * e^r 3550 // 3551 // (2^(j/64)) is precomputed 3552 // 3553 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3554 // e^r = 1 + q 3555 // 3556 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3557 // 3558 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 3559 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract; 3560 Register PH, PL; 3561 3562 if (ST.hasFastFMAF32()) { 3563 const float c_exp = numbers::log2ef; 3564 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 3565 const float c_exp10 = 0x1.a934f0p+1f; 3566 const float cc_exp10 = 0x1.2f346ep-24f; 3567 3568 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp); 3569 PH = B.buildFMul(Ty, X, C, Flags).getReg(0); 3570 auto NegPH = B.buildFNeg(Ty, PH, Flags); 3571 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags); 3572 3573 auto CC = B.buildFConstant(Ty, IsExp10 ? 
cc_exp10 : cc_exp); 3574 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0); 3575 } else { 3576 const float ch_exp = 0x1.714000p+0f; 3577 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3578 3579 const float ch_exp10 = 0x1.a92000p+1f; 3580 const float cl_exp10 = 0x1.4f0978p-11f; 3581 3582 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3583 auto XH = B.buildAnd(Ty, X, MaskConst); 3584 auto XL = B.buildFSub(Ty, X, XH, Flags); 3585 3586 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp); 3587 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0); 3588 3589 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp); 3590 auto XLCL = B.buildFMul(Ty, XL, CL, Flags); 3591 3592 Register Mad0 = 3593 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags); 3594 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags); 3595 } 3596 3597 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags); 3598 3599 // It is unsafe to contract this fsub into the PH multiply. 3600 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract); 3601 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags); 3602 auto IntE = B.buildFPTOSI(LLT::scalar(32), E); 3603 3604 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3605 .addUse(A.getReg(0)) 3606 .setMIFlags(Flags); 3607 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags); 3608 3609 auto UnderflowCheckConst = 3610 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f); 3611 auto Zero = B.buildFConstant(Ty, 0.0); 3612 auto Underflow = 3613 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst); 3614 3615 R = B.buildSelect(Ty, Underflow, Zero, R); 3616 3617 const auto &Options = MF.getTarget().Options; 3618 3619 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { 3620 auto OverflowCheckConst = 3621 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f); 3622 3623 auto Overflow = 3624 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst); 3625 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3626 R = B.buildSelect(Ty, Overflow, Inf, R, Flags); 3627 } 3628 3629 B.buildCopy(Dst, R); 3630 MI.eraseFromParent(); 3631 return true; 3632 } 3633 3634 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 3635 MachineIRBuilder &B) const { 3636 Register Dst = MI.getOperand(0).getReg(); 3637 Register Src0 = MI.getOperand(1).getReg(); 3638 Register Src1 = MI.getOperand(2).getReg(); 3639 unsigned Flags = MI.getFlags(); 3640 LLT Ty = B.getMRI()->getType(Dst); 3641 const LLT F16 = LLT::float16(); 3642 const LLT F32 = LLT::float32(); 3643 3644 if (Ty == F32) { 3645 auto Log = B.buildFLog2(F32, Src0, Flags); 3646 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) 3647 .addUse(Log.getReg(0)) 3648 .addUse(Src1) 3649 .setMIFlags(Flags); 3650 B.buildFExp2(Dst, Mul, Flags); 3651 } else if (Ty == F16) { 3652 // There's no f16 fmul_legacy, so we need to convert for it. 3653 auto Log = B.buildFLog2(F16, Src0, Flags); 3654 auto Ext0 = B.buildFPExt(F32, Log, Flags); 3655 auto Ext1 = B.buildFPExt(F32, Src1, Flags); 3656 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) 3657 .addUse(Ext0.getReg(0)) 3658 .addUse(Ext1.getReg(0)) 3659 .setMIFlags(Flags); 3660 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags); 3661 } else 3662 return false; 3663 3664 MI.eraseFromParent(); 3665 return true; 3666 } 3667 3668 // Find a source register, ignoring any possible source modifiers. 
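// Strips at most one outer G_FNEG and one G_FABS, e.g. for
// %m:_(s64) = G_FNEG (G_FABS %x) this returns %x. legalizeFFloor performs its
// NaN check on the stripped value, which is fine since fneg/fabs never change
// whether a value is a NaN.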
3669 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 3670 Register ModSrc = OrigSrc; 3671 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 3672 ModSrc = SrcFNeg->getOperand(1).getReg(); 3673 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3674 ModSrc = SrcFAbs->getOperand(1).getReg(); 3675 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3676 ModSrc = SrcFAbs->getOperand(1).getReg(); 3677 return ModSrc; 3678 } 3679 3680 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 3681 MachineRegisterInfo &MRI, 3682 MachineIRBuilder &B) const { 3683 3684 const LLT S1 = LLT::scalar(1); 3685 const LLT F64 = LLT::float64(); 3686 Register Dst = MI.getOperand(0).getReg(); 3687 Register OrigSrc = MI.getOperand(1).getReg(); 3688 unsigned Flags = MI.getFlags(); 3689 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 && 3690 "this should not have been custom lowered"); 3691 3692 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 3693 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 3694 // efficient way to implement it is using V_FRACT_F64. The workaround for the 3695 // V_FRACT bug is: 3696 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 3697 // 3698 // Convert floor(x) to (x - fract(x)) 3699 3700 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64}) 3701 .addUse(OrigSrc) 3702 .setMIFlags(Flags); 3703 3704 // Give source modifier matching some assistance before obscuring a foldable 3705 // pattern. 3706 3707 // TODO: We can avoid the neg on the fract? The input sign to fract 3708 // shouldn't matter? 3709 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 3710 3711 auto Const = 3712 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff)); 3713 3714 Register Min = MRI.createGenericVirtualRegister(F64); 3715 3716 // We don't need to concern ourselves with the snan handling difference, so 3717 // use the one which will directly select. 3718 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3719 if (MFI->getMode().IEEE) 3720 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 3721 else 3722 B.buildFMinNum(Min, Fract, Const, Flags); 3723 3724 Register CorrectedFract = Min; 3725 if (!MI.getFlag(MachineInstr::FmNoNans)) { 3726 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 3727 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0); 3728 } 3729 3730 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags); 3731 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 3732 3733 MI.eraseFromParent(); 3734 return true; 3735 } 3736 3737 // Turn an illegal packed v2s16 build vector into bit operations. 3738 // TODO: This should probably be a bitcast action in LegalizerHelper. 
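// For example, a G_BUILD_VECTOR_TRUNC of two s32 sources becomes: truncate
// each source to s16, merge the two s16 halves into a single s32, then bitcast
// that s32 to <2 x s16>. A plain G_BUILD_VECTOR of s16 sources skips the
// truncates.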
3739 bool AMDGPULegalizerInfo::legalizeBuildVector( 3740 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3741 Register Dst = MI.getOperand(0).getReg(); 3742 const LLT S32 = LLT::scalar(32); 3743 const LLT S16 = LLT::scalar(16); 3744 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 3745 3746 Register Src0 = MI.getOperand(1).getReg(); 3747 Register Src1 = MI.getOperand(2).getReg(); 3748 3749 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { 3750 assert(MRI.getType(Src0) == S32); 3751 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); 3752 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); 3753 } 3754 3755 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); 3756 B.buildBitcast(Dst, Merge); 3757 3758 MI.eraseFromParent(); 3759 return true; 3760 } 3761 3762 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 3763 // 3764 // Source and accumulation registers must all be 32-bits. 3765 // 3766 // TODO: When the multiply is uniform, we should produce a code sequence 3767 // that is better suited to instruction selection on the SALU. Instead of 3768 // the outer loop going over parts of the result, the outer loop should go 3769 // over parts of one of the factors. This should result in instruction 3770 // selection that makes full use of S_ADDC_U32 instructions. 3771 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, 3772 MutableArrayRef<Register> Accum, 3773 ArrayRef<Register> Src0, 3774 ArrayRef<Register> Src1, 3775 bool UsePartialMad64_32, 3776 bool SeparateOddAlignedProducts) const { 3777 // Use (possibly empty) vectors of S1 registers to represent the set of 3778 // carries from one pair of positions to the next. 3779 using Carry = SmallVector<Register, 2>; 3780 3781 MachineIRBuilder &B = Helper.MIRBuilder; 3782 GISelKnownBits &KB = *Helper.getKnownBits(); 3783 3784 const LLT S1 = LLT::scalar(1); 3785 const LLT S32 = LLT::scalar(32); 3786 const LLT S64 = LLT::scalar(64); 3787 3788 Register Zero32; 3789 Register Zero64; 3790 3791 auto getZero32 = [&]() -> Register { 3792 if (!Zero32) 3793 Zero32 = B.buildConstant(S32, 0).getReg(0); 3794 return Zero32; 3795 }; 3796 auto getZero64 = [&]() -> Register { 3797 if (!Zero64) 3798 Zero64 = B.buildConstant(S64, 0).getReg(0); 3799 return Zero64; 3800 }; 3801 3802 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; 3803 for (unsigned i = 0; i < Src0.size(); ++i) { 3804 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero()); 3805 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero()); 3806 } 3807 3808 // Merge the given carries into the 32-bit LocalAccum, which is modified 3809 // in-place. 3810 // 3811 // Returns the carry-out, which is a single S1 register or null. 
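// The carries are zero-extended to 32 bits and folded in with G_UADDE chains.
// If LocalAccum starts out null it is seeded from the carries (or from zero);
// the sum of a few single-bit carries cannot overflow 32 bits, so no carry-out
// is returned in that case.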
3812 auto mergeCarry = 3813 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 3814 if (CarryIn.empty()) 3815 return Register(); 3816 3817 bool HaveCarryOut = true; 3818 Register CarryAccum; 3819 if (CarryIn.size() == 1) { 3820 if (!LocalAccum) { 3821 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3822 return Register(); 3823 } 3824 3825 CarryAccum = getZero32(); 3826 } else { 3827 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3828 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 3829 CarryAccum = 3830 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 3831 .getReg(0); 3832 } 3833 3834 if (!LocalAccum) { 3835 LocalAccum = getZero32(); 3836 HaveCarryOut = false; 3837 } 3838 } 3839 3840 auto Add = 3841 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 3842 LocalAccum = Add.getReg(0); 3843 return HaveCarryOut ? Add.getReg(1) : Register(); 3844 }; 3845 3846 // Build a multiply-add chain to compute 3847 // 3848 // LocalAccum + (partial products at DstIndex) 3849 // + (opportunistic subset of CarryIn) 3850 // 3851 // LocalAccum is an array of one or two 32-bit registers that are updated 3852 // in-place. The incoming registers may be null. 3853 // 3854 // In some edge cases, carry-ins can be consumed "for free". In that case, 3855 // the consumed carry bits are removed from CarryIn in-place. 3856 auto buildMadChain = 3857 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 3858 -> Carry { 3859 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 3860 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 3861 3862 Carry CarryOut; 3863 unsigned j0 = 0; 3864 3865 // Use plain 32-bit multiplication for the most significant part of the 3866 // result by default. 3867 if (LocalAccum.size() == 1 && 3868 (!UsePartialMad64_32 || !CarryIn.empty())) { 3869 do { 3870 // Skip multiplication if one of the operands is 0 3871 unsigned j1 = DstIndex - j0; 3872 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 3873 ++j0; 3874 continue; 3875 } 3876 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 3877 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) { 3878 LocalAccum[0] = Mul.getReg(0); 3879 } else { 3880 if (CarryIn.empty()) { 3881 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 3882 } else { 3883 LocalAccum[0] = 3884 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 3885 .getReg(0); 3886 CarryIn.pop_back(); 3887 } 3888 } 3889 ++j0; 3890 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 3891 } 3892 3893 // Build full 64-bit multiplies. 
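// Each iteration folds the running 64-bit accumulator through
//   Tmp:s64, Carry:s1 = G_AMDGPU_MAD_U64_U32 Src0[j0], Src1[j1], Tmp
// and the carry-out only needs to be recorded once the accumulator may already
// be using the full 64 bits (i.e. once HaveSmallAccum is false).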
3894 if (j0 <= DstIndex) { 3895 bool HaveSmallAccum = false; 3896 Register Tmp; 3897 3898 if (LocalAccum[0]) { 3899 if (LocalAccum.size() == 1) { 3900 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 3901 HaveSmallAccum = true; 3902 } else if (LocalAccum[1]) { 3903 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); 3904 HaveSmallAccum = false; 3905 } else { 3906 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 3907 HaveSmallAccum = true; 3908 } 3909 } else { 3910 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 3911 Tmp = getZero64(); 3912 HaveSmallAccum = true; 3913 } 3914 3915 do { 3916 unsigned j1 = DstIndex - j0; 3917 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 3918 ++j0; 3919 continue; 3920 } 3921 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 3922 {Src0[j0], Src1[j1], Tmp}); 3923 Tmp = Mad.getReg(0); 3924 if (!HaveSmallAccum) 3925 CarryOut.push_back(Mad.getReg(1)); 3926 HaveSmallAccum = false; 3927 3928 ++j0; 3929 } while (j0 <= DstIndex); 3930 3931 auto Unmerge = B.buildUnmerge(S32, Tmp); 3932 LocalAccum[0] = Unmerge.getReg(0); 3933 if (LocalAccum.size() > 1) 3934 LocalAccum[1] = Unmerge.getReg(1); 3935 } 3936 3937 return CarryOut; 3938 }; 3939 3940 // Outer multiply loop, iterating over destination parts from least 3941 // significant to most significant parts. 3942 // 3943 // The columns of the following diagram correspond to the destination parts 3944 // affected by one iteration of the outer loop (ignoring boundary 3945 // conditions). 3946 // 3947 // Dest index relative to 2 * i: 1 0 -1 3948 // ------ 3949 // Carries from previous iteration: e o 3950 // Even-aligned partial product sum: E E . 3951 // Odd-aligned partial product sum: O O 3952 // 3953 // 'o' is OddCarry, 'e' is EvenCarry. 3954 // EE and OO are computed from partial products via buildMadChain and use 3955 // accumulation where possible and appropriate. 3956 // 3957 Register SeparateOddCarry; 3958 Carry EvenCarry; 3959 Carry OddCarry; 3960 3961 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 3962 Carry OddCarryIn = std::move(OddCarry); 3963 Carry EvenCarryIn = std::move(EvenCarry); 3964 OddCarry.clear(); 3965 EvenCarry.clear(); 3966 3967 // Partial products at offset 2 * i. 3968 if (2 * i < Accum.size()) { 3969 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 3970 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 3971 } 3972 3973 // Partial products at offset 2 * i - 1. 3974 if (i > 0) { 3975 if (!SeparateOddAlignedProducts) { 3976 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 3977 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3978 } else { 3979 bool IsHighest = 2 * i >= Accum.size(); 3980 Register SeparateOddOut[2]; 3981 auto LocalAccum = MutableArrayRef(SeparateOddOut) 3982 .take_front(IsHighest ? 
1 : 2); 3983 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3984 3985 MachineInstr *Lo; 3986 3987 if (i == 1) { 3988 if (!IsHighest) 3989 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 3990 else 3991 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 3992 } else { 3993 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 3994 SeparateOddCarry); 3995 } 3996 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 3997 3998 if (!IsHighest) { 3999 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 4000 Lo->getOperand(1).getReg()); 4001 Accum[2 * i] = Hi.getReg(0); 4002 SeparateOddCarry = Hi.getReg(1); 4003 } 4004 } 4005 } 4006 4007 // Add in the carries from the previous iteration 4008 if (i > 0) { 4009 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 4010 EvenCarryIn.push_back(CarryOut); 4011 4012 if (2 * i < Accum.size()) { 4013 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 4014 OddCarry.push_back(CarryOut); 4015 } 4016 } 4017 } 4018 } 4019 4020 // Custom narrowing of wide multiplies using wide multiply-add instructions. 4021 // 4022 // TODO: If the multiply is followed by an addition, we should attempt to 4023 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 4024 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 4025 MachineInstr &MI) const { 4026 assert(ST.hasMad64_32()); 4027 assert(MI.getOpcode() == TargetOpcode::G_MUL); 4028 4029 MachineIRBuilder &B = Helper.MIRBuilder; 4030 MachineRegisterInfo &MRI = *B.getMRI(); 4031 4032 Register DstReg = MI.getOperand(0).getReg(); 4033 Register Src0 = MI.getOperand(1).getReg(); 4034 Register Src1 = MI.getOperand(2).getReg(); 4035 4036 LLT Ty = MRI.getType(DstReg); 4037 assert(Ty.isScalar()); 4038 4039 unsigned Size = Ty.getSizeInBits(); 4040 unsigned NumParts = Size / 32; 4041 assert((Size % 32) == 0); 4042 assert(NumParts >= 2); 4043 4044 // Whether to use MAD_64_32 for partial products whose high half is 4045 // discarded. This avoids some ADD instructions but risks false dependency 4046 // stalls on some subtargets in some cases. 4047 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 4048 4049 // Whether to compute odd-aligned partial products separately. This is 4050 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 4051 // in an even-aligned VGPR. 4052 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 4053 4054 LLT S32 = LLT::scalar(32); 4055 SmallVector<Register, 2> Src0Parts, Src1Parts; 4056 for (unsigned i = 0; i < NumParts; ++i) { 4057 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 4058 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 4059 } 4060 B.buildUnmerge(Src0Parts, Src0); 4061 B.buildUnmerge(Src1Parts, Src1); 4062 4063 SmallVector<Register, 2> AccumRegs(NumParts); 4064 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 4065 SeparateOddAlignedProducts); 4066 4067 B.buildMergeLikeInstr(DstReg, AccumRegs); 4068 MI.eraseFromParent(); 4069 return true; 4070 } 4071 4072 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 4073 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 4074 // case with a single min instruction instead of a compare+select. 
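// For example, a 32-bit G_CTLZ becomes:
//   %ffbh:_(s32) = G_AMDGPU_FFBH_U32 %src   ; all-ones result for a zero input
//   %dst:_(s32)  = G_UMIN %ffbh, 32         ; clamps the zero case to the width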
4075 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 4076 MachineRegisterInfo &MRI, 4077 MachineIRBuilder &B) const { 4078 Register Dst = MI.getOperand(0).getReg(); 4079 Register Src = MI.getOperand(1).getReg(); 4080 LLT DstTy = MRI.getType(Dst); 4081 LLT SrcTy = MRI.getType(Src); 4082 4083 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 4084 ? AMDGPU::G_AMDGPU_FFBH_U32 4085 : AMDGPU::G_AMDGPU_FFBL_B32; 4086 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 4087 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 4088 4089 MI.eraseFromParent(); 4090 return true; 4091 } 4092 4093 // Check that this is a G_XOR x, -1 4094 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 4095 if (MI.getOpcode() != TargetOpcode::G_XOR) 4096 return false; 4097 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 4098 return ConstVal && *ConstVal == -1; 4099 } 4100 4101 // Return the use branch instruction, otherwise null if the usage is invalid. 4102 static MachineInstr * 4103 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 4104 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 4105 Register CondDef = MI.getOperand(0).getReg(); 4106 if (!MRI.hasOneNonDBGUse(CondDef)) 4107 return nullptr; 4108 4109 MachineBasicBlock *Parent = MI.getParent(); 4110 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 4111 4112 if (isNot(MRI, *UseMI)) { 4113 Register NegatedCond = UseMI->getOperand(0).getReg(); 4114 if (!MRI.hasOneNonDBGUse(NegatedCond)) 4115 return nullptr; 4116 4117 // We're deleting the def of this value, so we need to remove it. 4118 eraseInstr(*UseMI, MRI); 4119 4120 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 4121 Negated = true; 4122 } 4123 4124 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 4125 return nullptr; 4126 4127 // Make sure the cond br is followed by a G_BR, or is the last instruction. 4128 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 4129 if (Next == Parent->end()) { 4130 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 4131 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 4132 return nullptr; 4133 UncondBrTarget = &*NextMBB; 4134 } else { 4135 if (Next->getOpcode() != AMDGPU::G_BR) 4136 return nullptr; 4137 Br = &*Next; 4138 UncondBrTarget = Br->getOperand(0).getMBB(); 4139 } 4140 4141 return UseMI; 4142 } 4143 4144 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 4145 const ArgDescriptor *Arg, 4146 const TargetRegisterClass *ArgRC, 4147 LLT ArgTy) const { 4148 MCRegister SrcReg = Arg->getRegister(); 4149 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 4150 assert(DstReg.isVirtual() && "Virtual register expected"); 4151 4152 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 4153 *ArgRC, B.getDebugLoc(), ArgTy); 4154 if (Arg->isMasked()) { 4155 // TODO: Should we try to emit this once in the entry block? 4156 const LLT S32 = LLT::scalar(32); 4157 const unsigned Mask = Arg->getMask(); 4158 const unsigned Shift = llvm::countr_zero<unsigned>(Mask); 4159 4160 Register AndMaskSrc = LiveIn; 4161 4162 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 4163 // 0. 
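// For example, with a (hypothetical) mask of 0xffc00 covering bits [10, 20):
//   %shift:_(s32) = G_LSHR %livein, 10
//   %dst:_(s32)   = G_AND %shift, 0x3ff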
4164 if (Shift != 0) { 4165 auto ShiftAmt = B.buildConstant(S32, Shift); 4166 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 4167 } 4168 4169 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 4170 } else { 4171 B.buildCopy(DstReg, LiveIn); 4172 } 4173 4174 return true; 4175 } 4176 4177 bool AMDGPULegalizerInfo::loadInputValue( 4178 Register DstReg, MachineIRBuilder &B, 4179 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4180 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4181 const ArgDescriptor *Arg = nullptr; 4182 const TargetRegisterClass *ArgRC; 4183 LLT ArgTy; 4184 4185 CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); 4186 const ArgDescriptor WorkGroupIDX = 4187 ArgDescriptor::createRegister(AMDGPU::TTMP9); 4188 // If GridZ is not programmed in an entry function then the hardware will set 4189 // it to all zeros, so there is no need to mask the GridY value in the low 4190 // order bits. 4191 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( 4192 AMDGPU::TTMP7, 4193 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); 4194 const ArgDescriptor WorkGroupIDZ = 4195 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); 4196 if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { 4197 switch (ArgType) { 4198 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: 4199 Arg = &WorkGroupIDX; 4200 ArgRC = &AMDGPU::SReg_32RegClass; 4201 ArgTy = LLT::scalar(32); 4202 break; 4203 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: 4204 Arg = &WorkGroupIDY; 4205 ArgRC = &AMDGPU::SReg_32RegClass; 4206 ArgTy = LLT::scalar(32); 4207 break; 4208 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: 4209 Arg = &WorkGroupIDZ; 4210 ArgRC = &AMDGPU::SReg_32RegClass; 4211 ArgTy = LLT::scalar(32); 4212 break; 4213 default: 4214 break; 4215 } 4216 } 4217 4218 if (!Arg) 4219 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4220 4221 if (!Arg) { 4222 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 4223 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 4224 // case the pointer argument may be missing and we use null. 4225 B.buildConstant(DstReg, 0); 4226 return true; 4227 } 4228 4229 // It's undefined behavior if a function marked with the amdgpu-no-* 4230 // attributes uses the corresponding intrinsic. 
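// Folding the result to an implicit_def is as good a choice as any, and avoids
// having to emit a diagnostic here.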
4231 B.buildUndef(DstReg); 4232 return true; 4233 } 4234 4235 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 4236 return false; // TODO: Handle these 4237 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 4238 } 4239 4240 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 4241 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4242 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4243 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 4244 return false; 4245 4246 MI.eraseFromParent(); 4247 return true; 4248 } 4249 4250 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 4251 int64_t C) { 4252 B.buildConstant(MI.getOperand(0).getReg(), C); 4253 MI.eraseFromParent(); 4254 return true; 4255 } 4256 4257 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 4258 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4259 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4260 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 4261 if (MaxID == 0) 4262 return replaceWithConstant(B, MI, 0); 4263 4264 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4265 const ArgDescriptor *Arg; 4266 const TargetRegisterClass *ArgRC; 4267 LLT ArgTy; 4268 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4269 4270 Register DstReg = MI.getOperand(0).getReg(); 4271 if (!Arg) { 4272 // It's undefined behavior if a function marked with the amdgpu-no-* 4273 // attributes uses the corresponding intrinsic. 4274 B.buildUndef(DstReg); 4275 MI.eraseFromParent(); 4276 return true; 4277 } 4278 4279 if (Arg->isMasked()) { 4280 // Don't bother inserting AssertZext for packed IDs since we're emitting the 4281 // masking operations anyway. 4282 // 4283 // TODO: We could assert the top bit is 0 for the source copy. 4284 if (!loadInputValue(DstReg, B, ArgType)) 4285 return false; 4286 } else { 4287 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 4288 if (!loadInputValue(TmpReg, B, ArgType)) 4289 return false; 4290 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); 4291 } 4292 4293 MI.eraseFromParent(); 4294 return true; 4295 } 4296 4297 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 4298 int64_t Offset) const { 4299 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 4300 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 4301 4302 // TODO: If we passed in the base kernel offset we could have a better 4303 // alignment than 4, but we don't really need it. 4304 if (!loadInputValue(KernArgReg, B, 4305 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4306 llvm_unreachable("failed to find kernarg segment ptr"); 4307 4308 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 4309 // TODO: Should get nuw 4310 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 4311 } 4312 4313 /// Legalize a value that's loaded from kernel arguments. This is only used by 4314 /// legacy intrinsics. 
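/// The replacement is a dereferenceable, invariant s32 load from the kernarg
/// segment pointer plus \p Offset. Note that \p Alignment is currently unused;
/// the load simply assumes a 4-byte alignment.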
4315 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 4316 MachineIRBuilder &B, 4317 uint64_t Offset, 4318 Align Alignment) const { 4319 Register DstReg = MI.getOperand(0).getReg(); 4320 4321 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 4322 "unexpected kernarg parameter type"); 4323 4324 Register Ptr = getKernargParameterPtr(B, Offset); 4325 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 4326 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 4327 MachineMemOperand::MODereferenceable | 4328 MachineMemOperand::MOInvariant); 4329 MI.eraseFromParent(); 4330 return true; 4331 } 4332 4333 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 4334 MachineRegisterInfo &MRI, 4335 MachineIRBuilder &B) const { 4336 Register Dst = MI.getOperand(0).getReg(); 4337 LLT DstTy = MRI.getType(Dst); 4338 LLT S16 = LLT::scalar(16); 4339 LLT S32 = LLT::scalar(32); 4340 LLT S64 = LLT::scalar(64); 4341 4342 if (DstTy == S16) 4343 return legalizeFDIV16(MI, MRI, B); 4344 if (DstTy == S32) 4345 return legalizeFDIV32(MI, MRI, B); 4346 if (DstTy == S64) 4347 return legalizeFDIV64(MI, MRI, B); 4348 4349 return false; 4350 } 4351 4352 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 4353 Register DstDivReg, 4354 Register DstRemReg, 4355 Register X, 4356 Register Y) const { 4357 const LLT S1 = LLT::scalar(1); 4358 const LLT S32 = LLT::scalar(32); 4359 4360 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 4361 // algorithm used here. 4362 4363 // Initial estimate of inv(y). 4364 auto FloatY = B.buildUITOFP(S32, Y); 4365 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 4366 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); 4367 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 4368 auto Z = B.buildFPTOUI(S32, ScaledY); 4369 4370 // One round of UNR. 4371 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 4372 auto NegYZ = B.buildMul(S32, NegY, Z); 4373 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 4374 4375 // Quotient/remainder estimate. 4376 auto Q = B.buildUMulH(S32, X, Z); 4377 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 4378 4379 // First quotient/remainder refinement. 4380 auto One = B.buildConstant(S32, 1); 4381 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4382 if (DstDivReg) 4383 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 4384 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 4385 4386 // Second quotient/remainder refinement. 
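// (After the single Newton-Raphson step above, the quotient estimate should be
// low by at most 2, so two conditional corrections are expected to suffice.)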
4387 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4388 if (DstDivReg) 4389 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 4390 4391 if (DstRemReg) 4392 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 4393 } 4394 4395 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 4396 // 4397 // Return lo, hi of result 4398 // 4399 // %cvt.lo = G_UITOFP Val.lo 4400 // %cvt.hi = G_UITOFP Val.hi 4401 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 4402 // %rcp = G_AMDGPU_RCP_IFLAG %mad 4403 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 4404 // %mul2 = G_FMUL %mul1, 2**(-32) 4405 // %trunc = G_INTRINSIC_TRUNC %mul2 4406 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 4407 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 4408 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 4409 Register Val) { 4410 const LLT S32 = LLT::scalar(32); 4411 auto Unmerge = B.buildUnmerge(S32, Val); 4412 4413 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 4414 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 4415 4416 auto Mad = B.buildFMAD( 4417 S32, CvtHi, // 2**32 4418 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); 4419 4420 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 4421 auto Mul1 = B.buildFMul( 4422 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); 4423 4424 // 2**(-32) 4425 auto Mul2 = B.buildFMul( 4426 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); 4427 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 4428 4429 // -(2**32) 4430 auto Mad2 = B.buildFMAD( 4431 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), 4432 Mul1); 4433 4434 auto ResultLo = B.buildFPTOUI(S32, Mad2); 4435 auto ResultHi = B.buildFPTOUI(S32, Trunc); 4436 4437 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 4438 } 4439 4440 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 4441 Register DstDivReg, 4442 Register DstRemReg, 4443 Register Numer, 4444 Register Denom) const { 4445 const LLT S32 = LLT::scalar(32); 4446 const LLT S64 = LLT::scalar(64); 4447 const LLT S1 = LLT::scalar(1); 4448 Register RcpLo, RcpHi; 4449 4450 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 4451 4452 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); 4453 4454 auto Zero64 = B.buildConstant(S64, 0); 4455 auto NegDenom = B.buildSub(S64, Zero64, Denom); 4456 4457 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 4458 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 4459 4460 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 4461 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 4462 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 4463 4464 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 4465 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 4466 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); 4467 4468 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 4469 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 4470 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 4471 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 4472 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 4473 4474 auto Zero32 = B.buildConstant(S32, 0); 4475 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 4476 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 4477 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); 4478 4479 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 4480 Register NumerLo = UnmergeNumer.getReg(0); 4481 Register NumerHi = UnmergeNumer.getReg(1); 4482 4483 auto 
MulHi3 = B.buildUMulH(S64, Numer, Add2); 4484 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 4485 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 4486 Register Mul3_Lo = UnmergeMul3.getReg(0); 4487 Register Mul3_Hi = UnmergeMul3.getReg(1); 4488 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 4489 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 4490 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 4491 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); 4492 4493 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 4494 Register DenomLo = UnmergeDenom.getReg(0); 4495 Register DenomHi = UnmergeDenom.getReg(1); 4496 4497 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 4498 auto C1 = B.buildSExt(S32, CmpHi); 4499 4500 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 4501 auto C2 = B.buildSExt(S32, CmpLo); 4502 4503 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 4504 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 4505 4506 // TODO: Here and below portions of the code can be enclosed into if/endif. 4507 // Currently control flow is unconditional and we have 4 selects after 4508 // potential endif to substitute PHIs. 4509 4510 // if C3 != 0 ... 4511 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 4512 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 4513 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 4514 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); 4515 4516 auto One64 = B.buildConstant(S64, 1); 4517 auto Add3 = B.buildAdd(S64, MulHi3, One64); 4518 4519 auto C4 = 4520 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 4521 auto C5 = 4522 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 4523 auto C6 = B.buildSelect( 4524 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 4525 4526 // if (C6 != 0) 4527 auto Add4 = B.buildAdd(S64, Add3, One64); 4528 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 4529 4530 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 4531 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 4532 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); 4533 4534 // endif C6 4535 // endif C3 4536 4537 if (DstDivReg) { 4538 auto Sel1 = B.buildSelect( 4539 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 4540 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4541 Sel1, MulHi3); 4542 } 4543 4544 if (DstRemReg) { 4545 auto Sel2 = B.buildSelect( 4546 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 4547 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4548 Sel2, Sub1); 4549 } 4550 } 4551 4552 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 4553 MachineRegisterInfo &MRI, 4554 MachineIRBuilder &B) const { 4555 Register DstDivReg, DstRemReg; 4556 switch (MI.getOpcode()) { 4557 default: 4558 llvm_unreachable("Unexpected opcode!"); 4559 case AMDGPU::G_UDIV: { 4560 DstDivReg = MI.getOperand(0).getReg(); 4561 break; 4562 } 4563 case AMDGPU::G_UREM: { 4564 DstRemReg = MI.getOperand(0).getReg(); 4565 break; 4566 } 4567 case AMDGPU::G_UDIVREM: { 4568 DstDivReg = MI.getOperand(0).getReg(); 4569 DstRemReg = MI.getOperand(1).getReg(); 4570 break; 4571 } 4572 } 4573 4574 const LLT S64 = LLT::scalar(64); 4575 const LLT S32 = LLT::scalar(32); 4576 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4577 Register Num = 
MI.getOperand(FirstSrcOpIdx).getReg(); 4578 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4579 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4580 4581 if (Ty == S32) 4582 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 4583 else if (Ty == S64) 4584 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 4585 else 4586 return false; 4587 4588 MI.eraseFromParent(); 4589 return true; 4590 } 4591 4592 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 4593 MachineRegisterInfo &MRI, 4594 MachineIRBuilder &B) const { 4595 const LLT S64 = LLT::scalar(64); 4596 const LLT S32 = LLT::scalar(32); 4597 4598 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4599 if (Ty != S32 && Ty != S64) 4600 return false; 4601 4602 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4603 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 4604 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4605 4606 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 4607 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 4608 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 4609 4610 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 4611 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 4612 4613 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 4614 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 4615 4616 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 4617 switch (MI.getOpcode()) { 4618 default: 4619 llvm_unreachable("Unexpected opcode!"); 4620 case AMDGPU::G_SDIV: { 4621 DstDivReg = MI.getOperand(0).getReg(); 4622 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4623 break; 4624 } 4625 case AMDGPU::G_SREM: { 4626 DstRemReg = MI.getOperand(0).getReg(); 4627 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4628 break; 4629 } 4630 case AMDGPU::G_SDIVREM: { 4631 DstDivReg = MI.getOperand(0).getReg(); 4632 DstRemReg = MI.getOperand(1).getReg(); 4633 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4634 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4635 break; 4636 } 4637 } 4638 4639 if (Ty == S32) 4640 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4641 else 4642 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4643 4644 if (DstDivReg) { 4645 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 4646 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 4647 B.buildSub(DstDivReg, SignXor, Sign); 4648 } 4649 4650 if (DstRemReg) { 4651 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 4652 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 4653 B.buildSub(DstRemReg, SignXor, Sign); 4654 } 4655 4656 MI.eraseFromParent(); 4657 return true; 4658 } 4659 4660 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 4661 MachineRegisterInfo &MRI, 4662 MachineIRBuilder &B) const { 4663 Register Res = MI.getOperand(0).getReg(); 4664 Register LHS = MI.getOperand(1).getReg(); 4665 Register RHS = MI.getOperand(2).getReg(); 4666 uint16_t Flags = MI.getFlags(); 4667 LLT ResTy = MRI.getType(Res); 4668 4669 const MachineFunction &MF = B.getMF(); 4670 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || 4671 MF.getTarget().Options.UnsafeFPMath; 4672 4673 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 4674 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) 4675 return false; 4676 4677 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 4678 // the CI documentation has a worst case error of 1 ulp. 
4679 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 4680 // use it as long as we aren't trying to use denormals. 4681 // 4682 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. 4683 4684 // 1 / x -> RCP(x) 4685 if (CLHS->isExactlyValue(1.0)) { 4686 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) 4687 .addUse(RHS) 4688 .setMIFlags(Flags); 4689 4690 MI.eraseFromParent(); 4691 return true; 4692 } 4693 4694 // -1 / x -> RCP( FNEG(x) ) 4695 if (CLHS->isExactlyValue(-1.0)) { 4696 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 4697 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) 4698 .addUse(FNeg.getReg(0)) 4699 .setMIFlags(Flags); 4700 4701 MI.eraseFromParent(); 4702 return true; 4703 } 4704 } 4705 4706 // For f16 require afn or arcp. 4707 // For f32 require afn. 4708 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) || 4709 !MI.getFlag(MachineInstr::FmArcp))) 4710 return false; 4711 4712 // x / y -> x * (1.0 / y) 4713 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4714 .addUse(RHS) 4715 .setMIFlags(Flags); 4716 B.buildFMul(Res, LHS, RCP, Flags); 4717 4718 MI.eraseFromParent(); 4719 return true; 4720 } 4721 4722 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 4723 MachineRegisterInfo &MRI, 4724 MachineIRBuilder &B) const { 4725 Register Res = MI.getOperand(0).getReg(); 4726 Register X = MI.getOperand(1).getReg(); 4727 Register Y = MI.getOperand(2).getReg(); 4728 uint16_t Flags = MI.getFlags(); 4729 LLT ResTy = MRI.getType(Res); 4730 4731 const MachineFunction &MF = B.getMF(); 4732 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 4733 MI.getFlag(MachineInstr::FmAfn); 4734 4735 if (!AllowInaccurateRcp) 4736 return false; 4737 4738 auto NegY = B.buildFNeg(ResTy, Y); 4739 auto One = B.buildFConstant(ResTy, 1.0); 4740 4741 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4742 .addUse(Y) 4743 .setMIFlags(Flags); 4744 4745 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 4746 R = B.buildFMA(ResTy, Tmp0, R, R); 4747 4748 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 4749 R = B.buildFMA(ResTy, Tmp1, R, R); 4750 4751 auto Ret = B.buildFMul(ResTy, X, R); 4752 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 4753 4754 B.buildFMA(Res, Tmp2, R, Ret); 4755 MI.eraseFromParent(); 4756 return true; 4757 } 4758 4759 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 4760 MachineRegisterInfo &MRI, 4761 MachineIRBuilder &B) const { 4762 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4763 return true; 4764 4765 Register Res = MI.getOperand(0).getReg(); 4766 Register LHS = MI.getOperand(1).getReg(); 4767 Register RHS = MI.getOperand(2).getReg(); 4768 4769 uint16_t Flags = MI.getFlags(); 4770 4771 LLT S16 = LLT::scalar(16); 4772 LLT S32 = LLT::scalar(32); 4773 4774 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 4775 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 4776 4777 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 4778 .addUse(RHSExt.getReg(0)) 4779 .setMIFlags(Flags); 4780 4781 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 4782 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 4783 4784 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 4785 .addUse(RDst.getReg(0)) 4786 .addUse(RHS) 4787 .addUse(LHS) 4788 .setMIFlags(Flags); 4789 4790 MI.eraseFromParent(); 4791 return true; 4792 } 4793 4794 static const unsigned SPDenormModeBitField = 4795 AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 4796 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 4797 4798 // Enable or disable FP32 denorm mode. 
When 'Enable' is true, emit instructions 4799 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 4800 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, 4801 const GCNSubtarget &ST, 4802 SIModeRegisterDefaults Mode) { 4803 // Set SP denorm mode to this value. 4804 unsigned SPDenormMode = 4805 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 4806 4807 if (ST.hasDenormModeInst()) { 4808 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 4809 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 4810 4811 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 4812 B.buildInstr(AMDGPU::S_DENORM_MODE) 4813 .addImm(NewDenormModeValue); 4814 4815 } else { 4816 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 4817 .addImm(SPDenormMode) 4818 .addImm(SPDenormModeBitField); 4819 } 4820 } 4821 4822 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 4823 MachineRegisterInfo &MRI, 4824 MachineIRBuilder &B) const { 4825 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4826 return true; 4827 4828 Register Res = MI.getOperand(0).getReg(); 4829 Register LHS = MI.getOperand(1).getReg(); 4830 Register RHS = MI.getOperand(2).getReg(); 4831 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4832 SIModeRegisterDefaults Mode = MFI->getMode(); 4833 4834 uint16_t Flags = MI.getFlags(); 4835 4836 LLT S32 = LLT::scalar(32); 4837 LLT S1 = LLT::scalar(1); 4838 4839 auto One = B.buildFConstant(S32, 1.0f); 4840 4841 auto DenominatorScaled = 4842 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 4843 .addUse(LHS) 4844 .addUse(RHS) 4845 .addImm(0) 4846 .setMIFlags(Flags); 4847 auto NumeratorScaled = 4848 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 4849 .addUse(LHS) 4850 .addUse(RHS) 4851 .addImm(1) 4852 .setMIFlags(Flags); 4853 4854 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 4855 .addUse(DenominatorScaled.getReg(0)) 4856 .setMIFlags(Flags); 4857 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 4858 4859 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE(); 4860 const bool HasDynamicDenormals = 4861 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) || 4862 (Mode.FP32Denormals.Output == DenormalMode::Dynamic); 4863 4864 Register SavedSPDenormMode; 4865 if (!PreservesDenormals) { 4866 if (HasDynamicDenormals) { 4867 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 4868 B.buildInstr(AMDGPU::S_GETREG_B32) 4869 .addDef(SavedSPDenormMode) 4870 .addImm(SPDenormModeBitField); 4871 } 4872 toggleSPDenormMode(true, B, ST, Mode); 4873 } 4874 4875 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 4876 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 4877 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 4878 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 4879 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 4880 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 4881 4882 if (!PreservesDenormals) { 4883 if (HasDynamicDenormals) { 4884 assert(SavedSPDenormMode); 4885 B.buildInstr(AMDGPU::S_SETREG_B32) 4886 .addReg(SavedSPDenormMode) 4887 .addImm(SPDenormModeBitField); 4888 } else 4889 toggleSPDenormMode(false, B, ST, Mode); 4890 } 4891 4892 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}) 4893 .addUse(Fma4.getReg(0)) 4894 .addUse(Fma1.getReg(0)) 4895 .addUse(Fma3.getReg(0)) 4896 .addUse(NumeratorScaled.getReg(1)) 4897 
.setMIFlags(Flags); 4898 4899 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 4900 .addUse(Fmas.getReg(0)) 4901 .addUse(RHS) 4902 .addUse(LHS) 4903 .setMIFlags(Flags); 4904 4905 MI.eraseFromParent(); 4906 return true; 4907 } 4908 4909 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 4910 MachineRegisterInfo &MRI, 4911 MachineIRBuilder &B) const { 4912 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 4913 return true; 4914 4915 Register Res = MI.getOperand(0).getReg(); 4916 Register LHS = MI.getOperand(1).getReg(); 4917 Register RHS = MI.getOperand(2).getReg(); 4918 4919 uint16_t Flags = MI.getFlags(); 4920 4921 LLT S64 = LLT::scalar(64); 4922 LLT S1 = LLT::scalar(1); 4923 4924 auto One = B.buildFConstant(S64, 1.0); 4925 4926 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 4927 .addUse(LHS) 4928 .addUse(RHS) 4929 .addImm(0) 4930 .setMIFlags(Flags); 4931 4932 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 4933 4934 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}) 4935 .addUse(DivScale0.getReg(0)) 4936 .setMIFlags(Flags); 4937 4938 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 4939 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 4940 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 4941 4942 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 4943 .addUse(LHS) 4944 .addUse(RHS) 4945 .addImm(1) 4946 .setMIFlags(Flags); 4947 4948 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 4949 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 4950 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 4951 4952 Register Scale; 4953 if (!ST.hasUsableDivScaleConditionOutput()) { 4954 // Workaround a hardware bug on SI where the condition output from div_scale 4955 // is not usable. 4956 4957 LLT S32 = LLT::scalar(32); 4958 4959 auto NumUnmerge = B.buildUnmerge(S32, LHS); 4960 auto DenUnmerge = B.buildUnmerge(S32, RHS); 4961 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 4962 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 4963 4964 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 4965 Scale1Unmerge.getReg(1)); 4966 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 4967 Scale0Unmerge.getReg(1)); 4968 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 4969 } else { 4970 Scale = DivScale1.getReg(1); 4971 } 4972 4973 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}) 4974 .addUse(Fma4.getReg(0)) 4975 .addUse(Fma3.getReg(0)) 4976 .addUse(Mul.getReg(0)) 4977 .addUse(Scale) 4978 .setMIFlags(Flags); 4979 4980 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res)) 4981 .addUse(Fmas.getReg(0)) 4982 .addUse(RHS) 4983 .addUse(LHS) 4984 .setMIFlags(Flags); 4985 4986 MI.eraseFromParent(); 4987 return true; 4988 } 4989 4990 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI, 4991 MachineRegisterInfo &MRI, 4992 MachineIRBuilder &B) const { 4993 Register Res0 = MI.getOperand(0).getReg(); 4994 Register Res1 = MI.getOperand(1).getReg(); 4995 Register Val = MI.getOperand(2).getReg(); 4996 uint16_t Flags = MI.getFlags(); 4997 4998 LLT Ty = MRI.getType(Res0); 4999 LLT InstrExpTy = Ty == LLT::scalar(16) ? 
LLT::scalar(16) : LLT::scalar(32);
5000
5001 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5002 .addUse(Val)
5003 .setMIFlags(Flags);
5004 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5005 .addUse(Val)
5006 .setMIFlags(Flags);
5007
5008 if (ST.hasFractBug()) {
5009 auto Fabs = B.buildFAbs(Ty, Val);
5010 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5011 auto IsFinite =
5012 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5013 auto Zero = B.buildConstant(InstrExpTy, 0);
5014 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5015 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5016 }
5017
5018 B.buildCopy(Res0, Mant);
5019 B.buildSExtOrTrunc(Res1, Exp);
5020
5021 MI.eraseFromParent();
5022 return true;
5023 }
5024
5025 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5026 MachineRegisterInfo &MRI,
5027 MachineIRBuilder &B) const {
5028 Register Res = MI.getOperand(0).getReg();
5029 Register LHS = MI.getOperand(2).getReg();
5030 Register RHS = MI.getOperand(3).getReg();
5031 uint16_t Flags = MI.getFlags();
5032
5033 LLT S32 = LLT::scalar(32);
5034 LLT S1 = LLT::scalar(1);
5035
5036 auto Abs = B.buildFAbs(S32, RHS, Flags);
5037 const APFloat C0Val(1.0f);
5038
5039 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5040 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5041 auto C2 = B.buildFConstant(S32, 1.0f);
5042
5043 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5044 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5045
5046 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5047
5048 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5049 .addUse(Mul0.getReg(0))
5050 .setMIFlags(Flags);
5051
5052 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5053
5054 B.buildFMul(Res, Sel, Mul1, Flags);
5055
5056 MI.eraseFromParent();
5057 return true;
5058 }
5059
5060 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5061 MachineRegisterInfo &MRI,
5062 MachineIRBuilder &B) const {
5063 // Bypass the correct expansion that a standard promotion through G_FSQRT
5064 // would get. The f32 op is accurate enough for the f16 case.
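// A rough sketch of the sequence built below (generic MIR shown only for
// illustration):
//   %ext:_(s32)  = G_FPEXT %src:_(s16)
//   %sqrt:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %ext
//   %res:_(s16)  = G_FPTRUNC %sqrt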
5065 unsigned Flags = MI.getFlags(); 5066 assert(!ST.has16BitInsts()); 5067 const LLT F32 = LLT::scalar(32); 5068 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags); 5069 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32}) 5070 .addUse(Ext.getReg(0)) 5071 .setMIFlags(Flags); 5072 B.buildFPTrunc(MI.getOperand(0), Log2, Flags); 5073 MI.eraseFromParent(); 5074 return true; 5075 } 5076 5077 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI, 5078 MachineRegisterInfo &MRI, 5079 MachineIRBuilder &B) const { 5080 MachineFunction &MF = B.getMF(); 5081 Register Dst = MI.getOperand(0).getReg(); 5082 Register X = MI.getOperand(1).getReg(); 5083 const unsigned Flags = MI.getFlags(); 5084 const LLT S1 = LLT::scalar(1); 5085 const LLT F32 = LLT::scalar(32); 5086 const LLT I32 = LLT::scalar(32); 5087 5088 if (allowApproxFunc(MF, Flags)) { 5089 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst})) 5090 .addUse(X) 5091 .setMIFlags(Flags); 5092 MI.eraseFromParent(); 5093 return true; 5094 } 5095 5096 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f); 5097 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags); 5098 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f); 5099 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags); 5100 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags); 5101 5102 Register SqrtS = MRI.createGenericVirtualRegister(F32); 5103 if (needsDenormHandlingF32(MF, X, Flags)) { 5104 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS})) 5105 .addUse(SqrtX.getReg(0)) 5106 .setMIFlags(Flags); 5107 5108 auto NegOne = B.buildConstant(I32, -1); 5109 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne); 5110 5111 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags); 5112 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags); 5113 5114 auto PosOne = B.buildConstant(I32, 1); 5115 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne); 5116 5117 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags); 5118 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags); 5119 5120 auto Zero = B.buildFConstant(F32, 0.0f); 5121 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags); 5122 5123 SqrtS = 5124 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0); 5125 5126 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags); 5127 SqrtS = 5128 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0); 5129 } else { 5130 auto SqrtR = 5131 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0)); 5132 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags); 5133 5134 auto Half = B.buildFConstant(F32, 0.5f); 5135 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags); 5136 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags); 5137 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags); 5138 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags); 5139 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0); 5140 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags); 5141 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags); 5142 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0); 5143 } 5144 5145 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f); 5146 5147 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags); 5148 5149 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0); 5150 5151 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 5152 B.buildSelect(Dst, 
IsZeroOrInf, SqrtX, SqrtS, Flags); 5153 5154 MI.eraseFromParent(); 5155 return true; 5156 } 5157 5158 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI, 5159 MachineRegisterInfo &MRI, 5160 MachineIRBuilder &B) const { 5161 // For double type, the SQRT and RSQ instructions don't have required 5162 // precision, we apply Goldschmidt's algorithm to improve the result: 5163 // 5164 // y0 = rsq(x) 5165 // g0 = x * y0 5166 // h0 = 0.5 * y0 5167 // 5168 // r0 = 0.5 - h0 * g0 5169 // g1 = g0 * r0 + g0 5170 // h1 = h0 * r0 + h0 5171 // 5172 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 5173 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 5174 // h2 = h1 * r1 + h1 5175 // 5176 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 5177 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 5178 // 5179 // sqrt(x) = g3 5180 5181 const LLT S1 = LLT::scalar(1); 5182 const LLT S32 = LLT::scalar(32); 5183 const LLT F64 = LLT::scalar(64); 5184 5185 Register Dst = MI.getOperand(0).getReg(); 5186 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt"); 5187 5188 Register X = MI.getOperand(1).getReg(); 5189 unsigned Flags = MI.getFlags(); 5190 5191 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); 5192 5193 auto ZeroInt = B.buildConstant(S32, 0); 5194 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); 5195 5196 // Scale up input if it is too small. 5197 auto ScaleUpFactor = B.buildConstant(S32, 256); 5198 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); 5199 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); 5200 5201 auto SqrtY = 5202 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0)); 5203 5204 auto Half = B.buildFConstant(F64, 0.5); 5205 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); 5206 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); 5207 5208 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); 5209 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); 5210 5211 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); 5212 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); 5213 5214 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); 5215 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); 5216 5217 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); 5218 5219 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); 5220 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); 5221 5222 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); 5223 5224 // Scale down the result. 5225 auto ScaleDownFactor = B.buildConstant(S32, -128); 5226 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); 5227 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); 5228 5229 // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check 5230 // with finite only or nsz because rsq(+/-0) = +/-inf 5231 5232 // TODO: Check for DAZ and expand to subnormals 5233 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 5234 5235 // If x is +INF, +0, or -0, use its original value 5236 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags); 5237 5238 MI.eraseFromParent(); 5239 return true; 5240 } 5241 5242 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, 5243 MachineRegisterInfo &MRI, 5244 MachineIRBuilder &B) const { 5245 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 5246 if (Ty == LLT::scalar(32)) 5247 return legalizeFSQRTF32(MI, MRI, B); 5248 if (Ty == LLT::scalar(64)) 5249 return legalizeFSQRTF64(MI, MRI, B); 5250 if (Ty == LLT::scalar(16)) 5251 return legalizeFSQRTF16(MI, MRI, B); 5252 return false; 5253 } 5254 5255 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 5256 // FIXME: Why do we handle this one but not other removed instructions? 5257 // 5258 // Reciprocal square root. The clamp prevents infinite results, clamping 5259 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 5260 // +-max_float. 5261 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 5262 MachineRegisterInfo &MRI, 5263 MachineIRBuilder &B) const { 5264 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 5265 return true; 5266 5267 Register Dst = MI.getOperand(0).getReg(); 5268 Register Src = MI.getOperand(2).getReg(); 5269 auto Flags = MI.getFlags(); 5270 5271 LLT Ty = MRI.getType(Dst); 5272 5273 const fltSemantics *FltSemantics; 5274 if (Ty == LLT::scalar(32)) 5275 FltSemantics = &APFloat::IEEEsingle(); 5276 else if (Ty == LLT::scalar(64)) 5277 FltSemantics = &APFloat::IEEEdouble(); 5278 else 5279 return false; 5280 5281 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}) 5282 .addUse(Src) 5283 .setMIFlags(Flags); 5284 5285 // We don't need to concern ourselves with the snan handling difference, since 5286 // the rsq quieted (or not) so use the one which will directly select. 5287 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5288 const bool UseIEEE = MFI->getMode().IEEE; 5289 5290 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 5291 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 5292 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 5293 5294 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 5295 5296 if (UseIEEE) 5297 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 5298 else 5299 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 5300 MI.eraseFromParent(); 5301 return true; 5302 } 5303 5304 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 5305 switch (IID) { 5306 case Intrinsic::amdgcn_ds_fadd: 5307 return AMDGPU::G_ATOMICRMW_FADD; 5308 case Intrinsic::amdgcn_ds_fmin: 5309 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 5310 case Intrinsic::amdgcn_ds_fmax: 5311 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 5312 default: 5313 llvm_unreachable("not a DS FP intrinsic"); 5314 } 5315 } 5316 5317 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 5318 MachineInstr &MI, 5319 Intrinsic::ID IID) const { 5320 GISelChangeObserver &Observer = Helper.Observer; 5321 Observer.changingInstr(MI); 5322 5323 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 5324 5325 // The remaining operands were used to set fields in the MemOperand on 5326 // construction. 
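// Illustrative shape of the rewrite, assuming the usual ds.fadd operand
// order of (ptr, value, ordering, scope, isVolatile):
//   %dst = G_INTRINSIC_W_SIDE_EFFECTS ds.fadd, %ptr, %val, ord, scope, vol
// becomes, after stripping operands 6..4 and the intrinsic ID,
//   %dst = G_ATOMICRMW_FADD %ptr, %val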
5327 for (int I = 6; I > 3; --I) 5328 MI.removeOperand(I); 5329 5330 MI.removeOperand(1); // Remove the intrinsic ID. 5331 Observer.changedInstr(MI); 5332 return true; 5333 } 5334 5335 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 5336 MachineRegisterInfo &MRI, 5337 MachineIRBuilder &B) const { 5338 uint64_t Offset = 5339 ST.getTargetLowering()->getImplicitParameterOffset( 5340 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 5341 LLT DstTy = MRI.getType(DstReg); 5342 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 5343 5344 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 5345 if (!loadInputValue(KernargPtrReg, B, 5346 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 5347 return false; 5348 5349 // FIXME: This should be nuw 5350 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 5351 return true; 5352 } 5353 5354 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32 5355 /// bits of the pointer and replace them with the stride argument, then 5356 /// merge_values everything together. In the common case of a raw buffer (the 5357 /// stride component is 0), we can just AND off the upper half. 5358 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin( 5359 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5360 Register Result = MI.getOperand(0).getReg(); 5361 Register Pointer = MI.getOperand(2).getReg(); 5362 Register Stride = MI.getOperand(3).getReg(); 5363 Register NumRecords = MI.getOperand(4).getReg(); 5364 Register Flags = MI.getOperand(5).getReg(); 5365 5366 LLT S32 = LLT::scalar(32); 5367 5368 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5369 auto Unmerge = B.buildUnmerge(S32, Pointer); 5370 Register LowHalf = Unmerge.getReg(0); 5371 Register HighHalf = Unmerge.getReg(1); 5372 5373 auto AndMask = B.buildConstant(S32, 0x0000ffff); 5374 auto Masked = B.buildAnd(S32, HighHalf, AndMask); 5375 5376 MachineInstrBuilder NewHighHalf = Masked; 5377 std::optional<ValueAndVReg> StrideConst = 5378 getIConstantVRegValWithLookThrough(Stride, MRI); 5379 if (!StrideConst || !StrideConst->Value.isZero()) { 5380 MachineInstrBuilder ShiftedStride; 5381 if (StrideConst) { 5382 uint32_t StrideVal = StrideConst->Value.getZExtValue(); 5383 uint32_t ShiftedStrideVal = StrideVal << 16; 5384 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal); 5385 } else { 5386 auto ExtStride = B.buildAnyExt(S32, Stride); 5387 auto ShiftConst = B.buildConstant(S32, 16); 5388 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst); 5389 } 5390 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride); 5391 } 5392 Register NewHighHalfReg = NewHighHalf.getReg(0); 5393 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags}); 5394 MI.eraseFromParent(); 5395 return true; 5396 } 5397 5398 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 5399 MachineRegisterInfo &MRI, 5400 MachineIRBuilder &B) const { 5401 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5402 if (!MFI->isEntryFunction()) { 5403 return legalizePreloadedArgIntrin(MI, MRI, B, 5404 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 5405 } 5406 5407 Register DstReg = MI.getOperand(0).getReg(); 5408 if (!getImplicitArgPtr(DstReg, MRI, B)) 5409 return false; 5410 5411 MI.eraseFromParent(); 5412 return true; 5413 } 5414 5415 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, 5416 MachineRegisterInfo &MRI, 5417 MachineIRBuilder &B) const { 5418 Function &F = B.getMF().getFunction(); 5419 std::optional<uint32_t> 
KnownSize = 5420 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 5421 if (KnownSize.has_value()) 5422 B.buildConstant(DstReg, *KnownSize); 5423 return false; 5424 } 5425 5426 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, 5427 MachineRegisterInfo &MRI, 5428 MachineIRBuilder &B) const { 5429 5430 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5431 if (!MFI->isEntryFunction()) { 5432 return legalizePreloadedArgIntrin(MI, MRI, B, 5433 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 5434 } 5435 5436 Register DstReg = MI.getOperand(0).getReg(); 5437 if (!getLDSKernelId(DstReg, MRI, B)) 5438 return false; 5439 5440 MI.eraseFromParent(); 5441 return true; 5442 } 5443 5444 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 5445 MachineRegisterInfo &MRI, 5446 MachineIRBuilder &B, 5447 unsigned AddrSpace) const { 5448 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 5449 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 5450 Register Hi32 = Unmerge.getReg(1); 5451 5452 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 5453 MI.eraseFromParent(); 5454 return true; 5455 } 5456 5457 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 5458 // offset (the offset that is included in bounds checking and swizzling, to be 5459 // split between the instruction's voffset and immoffset fields) and soffset 5460 // (the offset that is excluded from bounds checking and swizzling, to go in 5461 // the instruction's soffset field). This function takes the first kind of 5462 // offset and figures out how to split it between voffset and immoffset. 5463 std::pair<Register, unsigned> 5464 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 5465 Register OrigOffset) const { 5466 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST); 5467 Register BaseReg; 5468 unsigned ImmOffset; 5469 const LLT S32 = LLT::scalar(32); 5470 MachineRegisterInfo &MRI = *B.getMRI(); 5471 5472 std::tie(BaseReg, ImmOffset) = 5473 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 5474 5475 // If BaseReg is a pointer, convert it to int. 5476 if (MRI.getType(BaseReg).isPointer()) 5477 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 5478 5479 // If the immediate value is too big for the immoffset field, put only bits 5480 // that would normally fit in the immoffset field. The remaining value that 5481 // is copied/added for the voffset field is a large power of 2, and it 5482 // stands more chance of being CSEd with the copy/add for another similar 5483 // load/store. 5484 // However, do not do that rounding down if that is a negative 5485 // number, as it appears to be illegal to have a negative offset in the 5486 // vgpr, even if adding the immediate offset makes it positive. 5487 unsigned Overflow = ImmOffset & ~MaxImm; 5488 ImmOffset -= Overflow; 5489 if ((int32_t)Overflow < 0) { 5490 Overflow += ImmOffset; 5491 ImmOffset = 0; 5492 } 5493 5494 if (Overflow != 0) { 5495 if (!BaseReg) { 5496 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 5497 } else { 5498 auto OverflowVal = B.buildConstant(S32, Overflow); 5499 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 5500 } 5501 } 5502 5503 if (!BaseReg) 5504 BaseReg = B.buildConstant(S32, 0).getReg(0); 5505 5506 return std::pair(BaseReg, ImmOffset); 5507 } 5508 5509 /// Handle register layout difference for f16 images for some subtargets. 
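/// For example (illustrative): with unpacked D16 memory instructions a
/// <4 x s16> store value is any-extended so each component occupies its own
/// 32-bit register and is rebuilt as <4 x s32>, whereas packed subtargets keep
/// two s16 components per dword and a <3 x s16> value is merely padded with
/// undef to <4 x s16>.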
5510 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 5511 MachineRegisterInfo &MRI, 5512 Register Reg, 5513 bool ImageStore) const { 5514 const LLT S16 = LLT::scalar(16); 5515 const LLT S32 = LLT::scalar(32); 5516 LLT StoreVT = MRI.getType(Reg); 5517 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 5518 5519 if (ST.hasUnpackedD16VMem()) { 5520 auto Unmerge = B.buildUnmerge(S16, Reg); 5521 5522 SmallVector<Register, 4> WideRegs; 5523 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5524 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 5525 5526 int NumElts = StoreVT.getNumElements(); 5527 5528 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 5529 .getReg(0); 5530 } 5531 5532 if (ImageStore && ST.hasImageStoreD16Bug()) { 5533 if (StoreVT.getNumElements() == 2) { 5534 SmallVector<Register, 4> PackedRegs; 5535 Reg = B.buildBitcast(S32, Reg).getReg(0); 5536 PackedRegs.push_back(Reg); 5537 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 5538 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 5539 .getReg(0); 5540 } 5541 5542 if (StoreVT.getNumElements() == 3) { 5543 SmallVector<Register, 4> PackedRegs; 5544 auto Unmerge = B.buildUnmerge(S16, Reg); 5545 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5546 PackedRegs.push_back(Unmerge.getReg(I)); 5547 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 5548 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 5549 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 5550 } 5551 5552 if (StoreVT.getNumElements() == 4) { 5553 SmallVector<Register, 4> PackedRegs; 5554 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 5555 auto Unmerge = B.buildUnmerge(S32, Reg); 5556 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5557 PackedRegs.push_back(Unmerge.getReg(I)); 5558 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 5559 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 5560 .getReg(0); 5561 } 5562 5563 llvm_unreachable("invalid data type"); 5564 } 5565 5566 if (StoreVT == LLT::fixed_vector(3, S16)) { 5567 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 5568 .getReg(0); 5569 } 5570 return Reg; 5571 } 5572 5573 Register AMDGPULegalizerInfo::fixStoreSourceType( 5574 MachineIRBuilder &B, Register VData, bool IsFormat) const { 5575 MachineRegisterInfo *MRI = B.getMRI(); 5576 LLT Ty = MRI->getType(VData); 5577 5578 const LLT S16 = LLT::scalar(16); 5579 5580 // Fixup buffer resources themselves needing to be v4i128. 5581 if (hasBufferRsrcWorkaround(Ty)) 5582 return castBufferRsrcToV4I32(VData, B); 5583 5584 // Fixup illegal register types for i8 stores. 
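// E.g. an s8 store value is simply widened below to the low bits of a
// 32-bit register, roughly:
//   %wide:_(s32) = G_ANYEXT %val:_(s8)
// and the store pseudo then consumes %wide.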
5585 if (Ty == LLT::scalar(8) || Ty == S16) { 5586 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 5587 return AnyExt; 5588 } 5589 5590 if (Ty.isVector()) { 5591 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 5592 if (IsFormat) 5593 return handleD16VData(B, *MRI, VData); 5594 } 5595 } 5596 5597 return VData; 5598 } 5599 5600 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 5601 MachineRegisterInfo &MRI, 5602 MachineIRBuilder &B, 5603 bool IsTyped, 5604 bool IsFormat) const { 5605 Register VData = MI.getOperand(1).getReg(); 5606 LLT Ty = MRI.getType(VData); 5607 LLT EltTy = Ty.getScalarType(); 5608 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5609 const LLT S32 = LLT::scalar(32); 5610 5611 VData = fixStoreSourceType(B, VData, IsFormat); 5612 castBufferRsrcArgToV4I32(MI, B, 2); 5613 Register RSrc = MI.getOperand(2).getReg(); 5614 5615 MachineMemOperand *MMO = *MI.memoperands_begin(); 5616 const int MemSize = MMO->getSize(); 5617 5618 unsigned ImmOffset; 5619 5620 // The typed intrinsics add an immediate after the registers. 5621 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5622 5623 // The struct intrinsic variants add one additional operand over raw. 5624 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 5625 Register VIndex; 5626 int OpOffset = 0; 5627 if (HasVIndex) { 5628 VIndex = MI.getOperand(3).getReg(); 5629 OpOffset = 1; 5630 } else { 5631 VIndex = B.buildConstant(S32, 0).getReg(0); 5632 } 5633 5634 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5635 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5636 5637 unsigned Format = 0; 5638 if (IsTyped) { 5639 Format = MI.getOperand(5 + OpOffset).getImm(); 5640 ++OpOffset; 5641 } 5642 5643 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5644 5645 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5646 5647 unsigned Opc; 5648 if (IsTyped) { 5649 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 5650 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 5651 } else if (IsFormat) { 5652 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 5653 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 5654 } else { 5655 switch (MemSize) { 5656 case 1: 5657 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 5658 break; 5659 case 2: 5660 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 5661 break; 5662 default: 5663 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 5664 break; 5665 } 5666 } 5667 5668 auto MIB = B.buildInstr(Opc) 5669 .addUse(VData) // vdata 5670 .addUse(RSrc) // rsrc 5671 .addUse(VIndex) // vindex 5672 .addUse(VOffset) // voffset 5673 .addUse(SOffset) // soffset 5674 .addImm(ImmOffset); // offset(imm) 5675 5676 if (IsTyped) 5677 MIB.addImm(Format); 5678 5679 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5680 .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) 5681 .addMemOperand(MMO); 5682 5683 MI.eraseFromParent(); 5684 return true; 5685 } 5686 5687 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, 5688 Register VIndex, Register VOffset, Register SOffset, 5689 unsigned ImmOffset, unsigned Format, 5690 unsigned AuxiliaryData, MachineMemOperand *MMO, 5691 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { 5692 auto MIB = B.buildInstr(Opc) 5693 .addDef(LoadDstReg) // vdata 5694 .addUse(RSrc) // rsrc 5695 .addUse(VIndex) // vindex 5696 .addUse(VOffset) // voffset 5697 .addUse(SOffset) // soffset 5698 .addImm(ImmOffset); // offset(imm) 5699 5700 if (IsTyped) 5701 MIB.addImm(Format); 5702 5703 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5704 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 5705 .addMemOperand(MMO); 5706 } 5707 5708 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 5709 MachineRegisterInfo &MRI, 5710 MachineIRBuilder &B, 5711 bool IsFormat, 5712 bool IsTyped) const { 5713 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 5714 MachineMemOperand *MMO = *MI.memoperands_begin(); 5715 const LLT MemTy = MMO->getMemoryType(); 5716 const LLT S32 = LLT::scalar(32); 5717 5718 Register Dst = MI.getOperand(0).getReg(); 5719 5720 Register StatusDst; 5721 int OpOffset = 0; 5722 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); 5723 bool IsTFE = MI.getNumExplicitDefs() == 2; 5724 if (IsTFE) { 5725 StatusDst = MI.getOperand(1).getReg(); 5726 ++OpOffset; 5727 } 5728 5729 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset); 5730 Register RSrc = MI.getOperand(2 + OpOffset).getReg(); 5731 5732 // The typed intrinsics add an immediate after the registers. 5733 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5734 5735 // The struct intrinsic variants add one additional operand over raw. 5736 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; 5737 Register VIndex; 5738 if (HasVIndex) { 5739 VIndex = MI.getOperand(3 + OpOffset).getReg(); 5740 ++OpOffset; 5741 } else { 5742 VIndex = B.buildConstant(S32, 0).getReg(0); 5743 } 5744 5745 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5746 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5747 5748 unsigned Format = 0; 5749 if (IsTyped) { 5750 Format = MI.getOperand(5 + OpOffset).getImm(); 5751 ++OpOffset; 5752 } 5753 5754 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5755 unsigned ImmOffset; 5756 5757 LLT Ty = MRI.getType(Dst); 5758 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the 5759 // logic doesn't have to handle that case. 5760 if (hasBufferRsrcWorkaround(Ty)) { 5761 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); 5762 Dst = MI.getOperand(0).getReg(); 5763 } 5764 LLT EltTy = Ty.getScalarType(); 5765 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5766 const bool Unpacked = ST.hasUnpackedD16VMem(); 5767 5768 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5769 5770 unsigned Opc; 5771 5772 // TODO: Support TFE for typed and narrow loads. 5773 if (IsTyped) { 5774 if (IsTFE) 5775 return false; 5776 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 5777 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 5778 } else if (IsFormat) { 5779 if (IsD16) { 5780 if (IsTFE) 5781 return false; 5782 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; 5783 } else { 5784 Opc = IsTFE ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE 5785 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 5786 } 5787 } else { 5788 if (IsTFE) 5789 return false; 5790 switch (MemTy.getSizeInBits()) { 5791 case 8: 5792 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 5793 break; 5794 case 16: 5795 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 5796 break; 5797 default: 5798 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 5799 break; 5800 } 5801 } 5802 5803 if (IsTFE) { 5804 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); 5805 unsigned NumLoadDWords = NumValueDWords + 1; 5806 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); 5807 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); 5808 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5809 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5810 if (NumValueDWords == 1) { 5811 B.buildUnmerge({Dst, StatusDst}, LoadDstReg); 5812 } else { 5813 SmallVector<Register, 5> LoadElts; 5814 for (unsigned I = 0; I != NumValueDWords; ++I) 5815 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); 5816 LoadElts.push_back(StatusDst); 5817 B.buildUnmerge(LoadElts, LoadDstReg); 5818 LoadElts.truncate(NumValueDWords); 5819 B.buildMergeLikeInstr(Dst, LoadElts); 5820 } 5821 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || 5822 (IsD16 && !Ty.isVector())) { 5823 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 5824 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5825 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5826 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5827 B.buildTrunc(Dst, LoadDstReg); 5828 } else if (Unpacked && IsD16 && Ty.isVector()) { 5829 LLT UnpackedTy = Ty.changeElementSize(32); 5830 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 5831 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5832 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5833 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5834 // FIXME: G_TRUNC should work, but legalization currently fails 5835 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 5836 SmallVector<Register, 4> Repack; 5837 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 5838 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 5839 B.buildMergeLikeInstr(Dst, Repack); 5840 } else { 5841 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, 5842 AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5843 } 5844 5845 MI.eraseFromParent(); 5846 return true; 5847 } 5848 5849 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 5850 switch (IntrID) { 5851 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 5852 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 5853 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 5854 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 5855 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 5856 case Intrinsic::amdgcn_raw_buffer_atomic_add: 5857 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 5858 case Intrinsic::amdgcn_struct_buffer_atomic_add: 5859 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 5860 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 5861 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 5862 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 5863 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 5864 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 5865 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 5866 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 5867 case 
Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 5868 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 5869 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 5870 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 5871 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 5872 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 5873 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 5874 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 5875 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 5876 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 5877 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 5878 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 5879 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 5880 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 5881 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 5882 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 5883 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 5884 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 5885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 5886 case Intrinsic::amdgcn_raw_buffer_atomic_and: 5887 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 5888 case Intrinsic::amdgcn_struct_buffer_atomic_and: 5889 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 5890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 5891 case Intrinsic::amdgcn_raw_buffer_atomic_or: 5892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 5893 case Intrinsic::amdgcn_struct_buffer_atomic_or: 5894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 5895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 5896 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 5897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 5898 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 5899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 5900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 5901 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 5902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 5903 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 5904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 5905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 5906 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 5907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 5908 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 5909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 5910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 5911 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 5912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 5913 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 5914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 5915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 5916 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 5917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 5918 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 5919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 5920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 5921 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: 5922 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: 5923 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16; 5924 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 5925 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 5926 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 5927 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 5928 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 5929 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 5930 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 5931 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 5932 case 
Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5933 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5934 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
5935 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
5936 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
5937 default:
5938 llvm_unreachable("unhandled atomic opcode");
5939 }
5940 }
5941
5942 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
5943 MachineIRBuilder &B,
5944 Intrinsic::ID IID) const {
5945 const bool IsCmpSwap =
5946 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
5947 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
5948 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
5949 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
5950
5951 Register Dst = MI.getOperand(0).getReg();
5952 // Since we don't have 128-bit atomics, we don't need to handle the case of
5953 // p8 arguments to the atomic itself.
5954 Register VData = MI.getOperand(2).getReg();
5955
5956 Register CmpVal;
5957 int OpOffset = 0;
5958
5959 if (IsCmpSwap) {
5960 CmpVal = MI.getOperand(3).getReg();
5961 ++OpOffset;
5962 }
5963
5964 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
5965 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
5966 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
5967
5968 // The struct intrinsic variants add one additional operand over raw.
5969 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5970 Register VIndex;
5971 if (HasVIndex) {
5972 VIndex = MI.getOperand(4 + OpOffset).getReg();
5973 ++OpOffset;
5974 } else {
5975 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
5976 }
5977
5978 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
5979 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
5980 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
5981
5982 MachineMemOperand *MMO = *MI.memoperands_begin();
5983
5984 unsigned ImmOffset;
5985 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5986
5987 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
5988 .addDef(Dst)
5989 .addUse(VData); // vdata
5990
5991 if (IsCmpSwap)
5992 MIB.addReg(CmpVal);
5993
5994 MIB.addUse(RSrc) // rsrc
5995 .addUse(VIndex) // vindex
5996 .addUse(VOffset) // voffset
5997 .addUse(SOffset) // soffset
5998 .addImm(ImmOffset) // offset(imm)
5999 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6000 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6001 .addMemOperand(MMO);
6002
6003 MI.eraseFromParent();
6004 return true;
6005 }
6006
6007 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6008 /// vector with s16 typed elements.
6009 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6010 SmallVectorImpl<Register> &PackedAddrs,
6011 unsigned ArgOffset,
6012 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6013 bool IsA16, bool IsG16) {
6014 const LLT S16 = LLT::scalar(16);
6015 const LLT V2S16 = LLT::fixed_vector(2, 16);
6016 auto EndIdx = Intr->VAddrEnd;
6017
6018 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6019 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6020 if (!SrcOp.isReg())
6021 continue; // _L to _LZ may have eliminated this.
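// E.g. with A16, two 16-bit coordinates of a 2D sample are packed below
// into a single dword, roughly:
//   %uv:_(<2 x s16>) = G_BUILD_VECTOR %u:_(s16), %v:_(s16)
// while a trailing odd coordinate (or the bias operand) is instead paired
// with undef.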
6022
6023 Register AddrReg = SrcOp.getReg();
6024
6025 if ((I < Intr->GradientStart) ||
6026 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6027 (I >= Intr->CoordStart && !IsA16)) {
6028 if ((I < Intr->GradientStart) && IsA16 &&
6029 (B.getMRI()->getType(AddrReg) == S16)) {
6030 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6031 // Special handling of bias when A16 is on. Bias is of type half but
6032 // occupies full 32-bit.
6033 PackedAddrs.push_back(
6034 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6035 .getReg(0));
6036 } else {
6037 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6038 "Bias needs to be converted to 16 bit in A16 mode");
6039 // Handle any gradient or coordinate operands that should not be packed
6040 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6041 PackedAddrs.push_back(AddrReg);
6042 }
6043 } else {
6044 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6045 // derivatives dx/dh and dx/dv are packed with undef.
6046 if (((I + 1) >= EndIdx) ||
6047 ((Intr->NumGradients / 2) % 2 == 1 &&
6048 (I == static_cast<unsigned>(Intr->GradientStart +
6049 (Intr->NumGradients / 2) - 1) ||
6050 I == static_cast<unsigned>(Intr->GradientStart +
6051 Intr->NumGradients - 1))) ||
6052 // Check for _L to _LZ optimization
6053 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6054 PackedAddrs.push_back(
6055 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6056 .getReg(0));
6057 } else {
6058 PackedAddrs.push_back(
6059 B.buildBuildVector(
6060 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6061 .getReg(0));
6062 ++I;
6063 }
6064 }
6065 }
6066 }
6067
6068 /// Convert from separate vaddr components to a single vector address register,
6069 /// and replace the remaining operands with $noreg.
6070 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6071 int DimIdx, int NumVAddrs) {
6072 const LLT S32 = LLT::scalar(32);
6073 (void)S32;
6074 SmallVector<Register, 8> AddrRegs;
6075 for (int I = 0; I != NumVAddrs; ++I) {
6076 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6077 if (SrcOp.isReg()) {
6078 AddrRegs.push_back(SrcOp.getReg());
6079 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6080 }
6081 }
6082
6083 int NumAddrRegs = AddrRegs.size();
6084 if (NumAddrRegs != 1) {
6085 auto VAddr =
6086 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6087 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6088 }
6089
6090 for (int I = 1; I != NumVAddrs; ++I) {
6091 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6092 if (SrcOp.isReg())
6093 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6094 }
6095 }
6096
6097 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6098 ///
6099 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
6100 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6101 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6102 /// registers.
6103 ///
6104 /// We don't want to directly select image instructions just yet, but also want
6105 /// to expose all register repacking to the legalizer/combiners. We also don't
6106 /// want a selected instruction entering RegBankSelect. In order to avoid
6107 /// defining a multitude of intermediate image instructions, directly hack on
6108 /// the intrinsic's arguments.
In cases like a16 addresses, this requires 6109 /// padding now unnecessary arguments with $noreg. 6110 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 6111 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 6112 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 6113 6114 const MachineFunction &MF = *MI.getMF(); 6115 const unsigned NumDefs = MI.getNumExplicitDefs(); 6116 const unsigned ArgOffset = NumDefs + 1; 6117 bool IsTFE = NumDefs == 2; 6118 // We are only processing the operands of d16 image operations on subtargets 6119 // that use the unpacked register layout, or need to repack the TFE result. 6120 6121 // TODO: Do we need to guard against already legalized intrinsics? 6122 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 6123 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 6124 6125 MachineRegisterInfo *MRI = B.getMRI(); 6126 const LLT S32 = LLT::scalar(32); 6127 const LLT S16 = LLT::scalar(16); 6128 const LLT V2S16 = LLT::fixed_vector(2, 16); 6129 6130 unsigned DMask = 0; 6131 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 6132 LLT Ty = MRI->getType(VData); 6133 6134 const bool IsAtomicPacked16Bit = 6135 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || 6136 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); 6137 6138 // Check for 16 bit addresses and pack if true. 6139 LLT GradTy = 6140 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 6141 LLT AddrTy = 6142 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 6143 const bool IsG16 = 6144 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; 6145 const bool IsA16 = AddrTy == S16; 6146 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16; 6147 6148 int DMaskLanes = 0; 6149 if (!BaseOpcode->Atomic) { 6150 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 6151 if (BaseOpcode->Gather4) { 6152 DMaskLanes = 4; 6153 } else if (DMask != 0) { 6154 DMaskLanes = llvm::popcount(DMask); 6155 } else if (!IsTFE && !BaseOpcode->Store) { 6156 // If dmask is 0, this is a no-op load. This can be eliminated. 6157 B.buildUndef(MI.getOperand(0)); 6158 MI.eraseFromParent(); 6159 return true; 6160 } 6161 } 6162 6163 Observer.changingInstr(MI); 6164 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 6165 6166 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 6167 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 6168 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 6169 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 6170 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 6171 6172 // Track that we legalized this 6173 MI.setDesc(B.getTII().get(NewOpcode)); 6174 6175 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 6176 // dmask to be at least 1 otherwise the instruction will fail 6177 if (IsTFE && DMask == 0) { 6178 DMask = 0x1; 6179 DMaskLanes = 1; 6180 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 6181 } 6182 6183 if (BaseOpcode->Atomic) { 6184 Register VData0 = MI.getOperand(2).getReg(); 6185 LLT Ty = MRI->getType(VData0); 6186 6187 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 6188 if (Ty.isVector() && !IsAtomicPacked16Bit) 6189 return false; 6190 6191 if (BaseOpcode->AtomicX2) { 6192 Register VData1 = MI.getOperand(3).getReg(); 6193 // The two values are packed in one register. 
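// E.g. for a 32-bit image atomic cmpswap (illustrative), the two data
// operands become
//   %packed:_(<2 x s32>) = G_BUILD_VECTOR %vdata0:_(s32), %vdata1:_(s32)
// and the now redundant second source operand is cleared to $noreg.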
6194 LLT PackedTy = LLT::fixed_vector(2, Ty); 6195 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 6196 MI.getOperand(2).setReg(Concat.getReg(0)); 6197 MI.getOperand(3).setReg(AMDGPU::NoRegister); 6198 } 6199 } 6200 6201 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 6202 6203 // Rewrite the addressing register layout before doing anything else. 6204 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 6205 // 16 bit gradients are supported, but are tied to the A16 control 6206 // so both gradients and addresses must be 16 bit 6207 return false; 6208 } 6209 6210 if (IsA16 && !ST.hasA16()) { 6211 // A16 not supported 6212 return false; 6213 } 6214 6215 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler); 6216 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); 6217 6218 if (IsA16 || IsG16) { 6219 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the 6220 // instructions expect VGPR_32 6221 SmallVector<Register, 4> PackedRegs; 6222 6223 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16); 6224 6225 // See also below in the non-a16 branch 6226 const bool UseNSA = ST.hasNSAEncoding() && 6227 PackedRegs.size() >= ST.getNSAThreshold(MF) && 6228 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); 6229 const bool UsePartialNSA = 6230 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; 6231 6232 if (UsePartialNSA) { 6233 // Pack registers that would go over NSAMaxSize into last VAddr register 6234 LLT PackedAddrTy = 6235 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); 6236 auto Concat = B.buildConcatVectors( 6237 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); 6238 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); 6239 PackedRegs.resize(NSAMaxSize); 6240 } else if (!UseNSA && PackedRegs.size() > 1) { 6241 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 6242 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 6243 PackedRegs[0] = Concat.getReg(0); 6244 PackedRegs.resize(1); 6245 } 6246 6247 const unsigned NumPacked = PackedRegs.size(); 6248 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 6249 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 6250 if (!SrcOp.isReg()) { 6251 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 6252 continue; 6253 } 6254 6255 assert(SrcOp.getReg() != AMDGPU::NoRegister); 6256 6257 if (I - Intr->VAddrStart < NumPacked) 6258 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 6259 else 6260 SrcOp.setReg(AMDGPU::NoRegister); 6261 } 6262 } else { 6263 // If the register allocator cannot place the address registers contiguously 6264 // without introducing moves, then using the non-sequential address encoding 6265 // is always preferable, since it saves VALU instructions and is usually a 6266 // wash in terms of code size or even better. 6267 // 6268 // However, we currently have no way of hinting to the register allocator 6269 // that MIMG addresses should be placed contiguously when it is possible to 6270 // do so, so force non-NSA for the common 2-address case as a heuristic. 6271 // 6272 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 6273 // allocation when possible. 6274 // 6275 // Partial NSA is allowed on GFX11+ where the final register is a contiguous 6276 // set of the remaining addresses. 
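// E.g. (hypothetical counts) with 7 address operands and an NSA limit of
// 5, partial NSA leaves the first 4 operands in individual VGPRs and folds
// the remaining 3 into one vector register in the final VAddr slot;
// without NSA all 7 would be packed into a single vector register.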
6277 const bool UseNSA = ST.hasNSAEncoding() && 6278 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && 6279 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); 6280 const bool UsePartialNSA = 6281 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; 6282 6283 if (UsePartialNSA) { 6284 convertImageAddrToPacked(B, MI, 6285 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, 6286 Intr->NumVAddrs - NSAMaxSize + 1); 6287 } else if (!UseNSA && Intr->NumVAddrs > 1) { 6288 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 6289 Intr->NumVAddrs); 6290 } 6291 } 6292 6293 int Flags = 0; 6294 if (IsA16) 6295 Flags |= 1; 6296 if (IsG16) 6297 Flags |= 2; 6298 MI.addOperand(MachineOperand::CreateImm(Flags)); 6299 6300 if (BaseOpcode->Store) { // No TFE for stores? 6301 // TODO: Handle dmask trim 6302 if (!Ty.isVector() || !IsD16) 6303 return true; 6304 6305 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 6306 if (RepackedReg != VData) { 6307 MI.getOperand(1).setReg(RepackedReg); 6308 } 6309 6310 return true; 6311 } 6312 6313 Register DstReg = MI.getOperand(0).getReg(); 6314 const LLT EltTy = Ty.getScalarType(); 6315 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 6316 6317 // Confirm that the return type is large enough for the dmask specified 6318 if (NumElts < DMaskLanes) 6319 return false; 6320 6321 if (NumElts > 4 || DMaskLanes > 4) 6322 return false; 6323 6324 // Image atomic instructions are using DMask to specify how many bits 6325 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16). 6326 // DMaskLanes for image atomic has default value '0'. 6327 // We must be sure that atomic variants (especially packed) will not be 6328 // truncated from v2s16 or v4s16 to s16 type. 6329 // 6330 // ChangeElementCount will be needed for image load where Ty is always scalar. 6331 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 6332 const LLT AdjustedTy = 6333 DMaskLanes == 0 6334 ? Ty 6335 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 6336 6337 // The raw dword aligned data component of the load. The only legal cases 6338 // where this matters should be when using the packed D16 format, for 6339 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 6340 LLT RoundedTy; 6341 6342 // S32 vector to cover all data, plus TFE result element. 6343 LLT TFETy; 6344 6345 // Register type to use for each loaded component. Will be S32 or V2S16. 6346 LLT RegTy; 6347 6348 if (IsD16 && ST.hasUnpackedD16VMem()) { 6349 RoundedTy = 6350 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 6351 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 6352 RegTy = S32; 6353 } else { 6354 unsigned EltSize = EltTy.getSizeInBits(); 6355 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 6356 unsigned RoundedSize = 32 * RoundedElts; 6357 RoundedTy = LLT::scalarOrVector( 6358 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 6359 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 6360 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 6361 } 6362 6363 // The return type does not need adjustment. 6364 // TODO: Should we change s16 case to s32 or <2 x s16>? 6365 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 6366 return true; 6367 6368 Register Dst1Reg; 6369 6370 // Insert after the instruction. 6371 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 6372 6373 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 6374 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 
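// E.g. (illustrative) a 32-bit load with three dmask lanes and TFE uses
// RoundedTy = <3 x s32>, TFETy = <4 x s32> (data plus one status dword) and
// RegTy = s32; the wide load result is unmerged back into the original
// results further down.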
6375 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 6376 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 6377 6378 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 6379 6380 MI.getOperand(0).setReg(NewResultReg); 6381 6382 // In the IR, TFE is supposed to be used with a 2 element struct return 6383 // type. The instruction really returns these two values in one contiguous 6384 // register, with one additional dword beyond the loaded data. Rewrite the 6385 // return type to use a single register result. 6386 6387 if (IsTFE) { 6388 Dst1Reg = MI.getOperand(1).getReg(); 6389 if (MRI->getType(Dst1Reg) != S32) 6390 return false; 6391 6392 // TODO: Make sure the TFE operand bit is set. 6393 MI.removeOperand(1); 6394 6395 // Handle the easy case that requires no repack instructions. 6396 if (Ty == S32) { 6397 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 6398 return true; 6399 } 6400 } 6401 6402 // Now figure out how to copy the new result register back into the old 6403 // result. 6404 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 6405 6406 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 6407 6408 if (ResultNumRegs == 1) { 6409 assert(!IsTFE); 6410 ResultRegs[0] = NewResultReg; 6411 } else { 6412 // We have to repack into a new vector of some kind. 6413 for (int I = 0; I != NumDataRegs; ++I) 6414 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 6415 B.buildUnmerge(ResultRegs, NewResultReg); 6416 6417 // Drop the final TFE element to get the data part. The TFE result is 6418 // directly written to the right place already. 6419 if (IsTFE) 6420 ResultRegs.resize(NumDataRegs); 6421 } 6422 6423 // For an s16 scalar result, we form an s32 result with a truncate regardless 6424 // of packed vs. unpacked. 6425 if (IsD16 && !Ty.isVector()) { 6426 B.buildTrunc(DstReg, ResultRegs[0]); 6427 return true; 6428 } 6429 6430 // Avoid a build/concat_vector of 1 entry. 6431 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 6432 B.buildBitcast(DstReg, ResultRegs[0]); 6433 return true; 6434 } 6435 6436 assert(Ty.isVector()); 6437 6438 if (IsD16) { 6439 // For packed D16 results with TFE enabled, all the data components are 6440 // S32. Cast back to the expected type. 6441 // 6442 // TODO: We don't really need to use load s32 elements. We would only need one 6443 // cast for the TFE result if a multiple of v2s16 was used. 6444 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 6445 for (Register &Reg : ResultRegs) 6446 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 6447 } else if (ST.hasUnpackedD16VMem()) { 6448 for (Register &Reg : ResultRegs) 6449 Reg = B.buildTrunc(S16, Reg).getReg(0); 6450 } 6451 } 6452 6453 auto padWithUndef = [&](LLT Ty, int NumElts) { 6454 if (NumElts == 0) 6455 return; 6456 Register Undef = B.buildUndef(Ty).getReg(0); 6457 for (int I = 0; I != NumElts; ++I) 6458 ResultRegs.push_back(Undef); 6459 }; 6460 6461 // Pad out any elements eliminated due to the dmask. 6462 LLT ResTy = MRI->getType(ResultRegs[0]); 6463 if (!ResTy.isVector()) { 6464 padWithUndef(ResTy, NumElts - ResultRegs.size()); 6465 B.buildBuildVector(DstReg, ResultRegs); 6466 return true; 6467 } 6468 6469 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 6470 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 6471 6472 // Deal with the one annoying legal case. 
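// E.g. a packed d16 <3 x s16> result is really loaded as <4 x s16> worth of
// dwords (with TFE the two <2 x s16> pieces are first re-concatenated), and
// the trailing element is then dropped, or the value padded with undef, to
// match the destination type.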
6473 const LLT V3S16 = LLT::fixed_vector(3, 16); 6474 if (Ty == V3S16) { 6475 if (IsTFE) { 6476 if (ResultRegs.size() == 1) { 6477 NewResultReg = ResultRegs[0]; 6478 } else if (ResultRegs.size() == 2) { 6479 LLT V4S16 = LLT::fixed_vector(4, 16); 6480 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 6481 } else { 6482 return false; 6483 } 6484 } 6485 6486 if (MRI->getType(DstReg).getNumElements() < 6487 MRI->getType(NewResultReg).getNumElements()) { 6488 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 6489 } else { 6490 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 6491 } 6492 return true; 6493 } 6494 6495 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 6496 B.buildConcatVectors(DstReg, ResultRegs); 6497 return true; 6498 } 6499 6500 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper, 6501 MachineInstr &MI) const { 6502 MachineIRBuilder &B = Helper.MIRBuilder; 6503 GISelChangeObserver &Observer = Helper.Observer; 6504 6505 Register OrigDst = MI.getOperand(0).getReg(); 6506 Register Dst; 6507 LLT Ty = B.getMRI()->getType(OrigDst); 6508 unsigned Size = Ty.getSizeInBits(); 6509 MachineFunction &MF = B.getMF(); 6510 unsigned Opc = 0; 6511 if (Size < 32 && ST.hasScalarSubwordLoads()) { 6512 assert(Size == 8 || Size == 16); 6513 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE 6514 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT; 6515 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit 6516 // destination register. 6517 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32)); 6518 } else { 6519 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD; 6520 Dst = OrigDst; 6521 } 6522 6523 Observer.changingInstr(MI); 6524 6525 // Handle needing to s.buffer.load() a p8 value. 6526 if (hasBufferRsrcWorkaround(Ty)) { 6527 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); 6528 B.setInsertPt(B.getMBB(), MI); 6529 } 6530 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 6531 Ty = getBitcastRegisterType(Ty); 6532 Helper.bitcastDst(MI, Ty, 0); 6533 B.setInsertPt(B.getMBB(), MI); 6534 } 6535 6536 // FIXME: We don't really need this intermediate instruction. The intrinsic 6537 // should be fixed to have a memory operand. Since it's readnone, we're not 6538 // allowed to add one. 6539 MI.setDesc(B.getTII().get(Opc)); 6540 MI.removeOperand(1); // Remove intrinsic ID 6541 6542 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 6543 // TODO: Should this use datalayout alignment? 6544 const unsigned MemSize = (Size + 7) / 8; 6545 const Align MemAlign(std::min(MemSize, 4u)); 6546 MachineMemOperand *MMO = MF.getMachineMemOperand( 6547 MachinePointerInfo(), 6548 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6549 MachineMemOperand::MOInvariant, 6550 MemSize, MemAlign); 6551 MI.addMemOperand(MF, MMO); 6552 if (Dst != OrigDst) { 6553 MI.getOperand(0).setReg(Dst); 6554 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 6555 B.buildTrunc(OrigDst, Dst); 6556 } 6557 6558 // If we don't have 96-bit result scalar loads, widening to 128-bit should 6559 // always be legal. We may need to restore this to a 96-bit result if it turns 6560 // out this needs to be converted to a vector load during RegBankSelect. 
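// E.g. an s96 (or <3 x s32>) result is widened here to s128 (or <4 x s32>)
// when the subtarget lacks scalar dwordx3 loads.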
6561 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) { 6562 if (Ty.isVector()) 6563 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 6564 else 6565 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 6566 } 6567 6568 Observer.changedInstr(MI); 6569 return true; 6570 } 6571 6572 // TODO: Move to selection 6573 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 6574 MachineRegisterInfo &MRI, 6575 MachineIRBuilder &B) const { 6576 if (!ST.isTrapHandlerEnabled() || 6577 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 6578 return legalizeTrapEndpgm(MI, MRI, B); 6579 6580 return ST.supportsGetDoorbellID() ? 6581 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); 6582 } 6583 6584 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 6585 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6586 const DebugLoc &DL = MI.getDebugLoc(); 6587 MachineBasicBlock &BB = B.getMBB(); 6588 MachineFunction *MF = BB.getParent(); 6589 6590 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) { 6591 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 6592 .addImm(0); 6593 MI.eraseFromParent(); 6594 return true; 6595 } 6596 6597 // We need a block split to make the real endpgm a terminator. We also don't 6598 // want to break phis in successor blocks, so we can't just delete to the 6599 // end of the block. 6600 BB.splitAt(MI, false /*UpdateLiveIns*/); 6601 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 6602 MF->push_back(TrapBB); 6603 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 6604 .addImm(0); 6605 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ)) 6606 .addMBB(TrapBB); 6607 6608 BB.addSuccessor(TrapBB); 6609 MI.eraseFromParent(); 6610 return true; 6611 } 6612 6613 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 6614 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6615 MachineFunction &MF = B.getMF(); 6616 const LLT S64 = LLT::scalar(64); 6617 6618 Register SGPR01(AMDGPU::SGPR0_SGPR1); 6619 // For code object version 5, queue_ptr is passed through implicit kernarg. 6620 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= 6621 AMDGPU::AMDHSA_COV5) { 6622 AMDGPUTargetLowering::ImplicitParameter Param = 6623 AMDGPUTargetLowering::QUEUE_PTR; 6624 uint64_t Offset = 6625 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 6626 6627 Register KernargPtrReg = MRI.createGenericVirtualRegister( 6628 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6629 6630 if (!loadInputValue(KernargPtrReg, B, 6631 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 6632 return false; 6633 6634 // TODO: can we be smarter about machine pointer info? 
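// Note: the queue pointer is fetched from the implicit kernel arguments here;
// the address is the kernarg segment pointer plus the implicit-arg Offset
// computed above, the s64 value is loaded from constant address space, and it
// is handed to the trap handler in SGPR0_SGPR1 (see the trap-handler ABI
// reference below).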
6635     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6636     MachineMemOperand *MMO = MF.getMachineMemOperand(
6637         PtrInfo,
6638         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6639             MachineMemOperand::MOInvariant,
6640         LLT::scalar(64), commonAlignment(Align(64), Offset));
6641
6642     // Compute the address of the queue_ptr implicit kernel argument.
6643     Register LoadAddr = MRI.createGenericVirtualRegister(
6644         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6645     B.buildPtrAdd(LoadAddr, KernargPtrReg,
6646                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6647     // Load the queue pointer from that address.
6648     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6649     B.buildCopy(SGPR01, Temp);
6650     B.buildInstr(AMDGPU::S_TRAP)
6651         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6652         .addReg(SGPR01, RegState::Implicit);
6653     MI.eraseFromParent();
6654     return true;
6655   }
6656
6657   // Pass the queue pointer to the trap handler as an input and insert the trap instruction.
6658   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6659   Register LiveIn =
6660       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6661   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6662     return false;
6663
6664   B.buildCopy(SGPR01, LiveIn);
6665   B.buildInstr(AMDGPU::S_TRAP)
6666       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6667       .addReg(SGPR01, RegState::Implicit);
6668
6669   MI.eraseFromParent();
6670   return true;
6671 }
6672
6673 bool AMDGPULegalizerInfo::legalizeTrapHsa(
6674     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6675   B.buildInstr(AMDGPU::S_TRAP)
6676       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6677   MI.eraseFromParent();
6678   return true;
6679 }
6680
6681 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
6682     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6683   // Is non-HSA path or trap-handler disabled?
Then, report a warning 6684 // accordingly 6685 if (!ST.isTrapHandlerEnabled() || 6686 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 6687 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 6688 "debugtrap handler not supported", 6689 MI.getDebugLoc(), DS_Warning); 6690 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 6691 Ctx.diagnose(NoTrap); 6692 } else { 6693 // Insert debug-trap instruction 6694 B.buildInstr(AMDGPU::S_TRAP) 6695 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 6696 } 6697 6698 MI.eraseFromParent(); 6699 return true; 6700 } 6701 6702 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 6703 MachineIRBuilder &B) const { 6704 MachineRegisterInfo &MRI = *B.getMRI(); 6705 const LLT S16 = LLT::scalar(16); 6706 const LLT S32 = LLT::scalar(32); 6707 const LLT V2S16 = LLT::fixed_vector(2, 16); 6708 const LLT V3S32 = LLT::fixed_vector(3, 32); 6709 6710 Register DstReg = MI.getOperand(0).getReg(); 6711 Register NodePtr = MI.getOperand(2).getReg(); 6712 Register RayExtent = MI.getOperand(3).getReg(); 6713 Register RayOrigin = MI.getOperand(4).getReg(); 6714 Register RayDir = MI.getOperand(5).getReg(); 6715 Register RayInvDir = MI.getOperand(6).getReg(); 6716 Register TDescr = MI.getOperand(7).getReg(); 6717 6718 if (!ST.hasGFX10_AEncoding()) { 6719 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 6720 "intrinsic not supported on subtarget", 6721 MI.getDebugLoc()); 6722 B.getMF().getFunction().getContext().diagnose(BadIntrin); 6723 return false; 6724 } 6725 6726 const bool IsGFX11 = AMDGPU::isGFX11(ST); 6727 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); 6728 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST); 6729 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 6730 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 6731 const unsigned NumVDataDwords = 4; 6732 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 6733 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; 6734 const bool UseNSA = 6735 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize()); 6736 6737 const unsigned BaseOpcodes[2][2] = { 6738 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 6739 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 6740 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 6741 int Opcode; 6742 if (UseNSA) { 6743 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 6744 IsGFX12Plus ? AMDGPU::MIMGEncGfx12 6745 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA 6746 : AMDGPU::MIMGEncGfx10NSA, 6747 NumVDataDwords, NumVAddrDwords); 6748 } else { 6749 assert(!IsGFX12Plus); 6750 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 6751 IsGFX11 ? 
AMDGPU::MIMGEncGfx11Default 6752 : AMDGPU::MIMGEncGfx10Default, 6753 NumVDataDwords, NumVAddrDwords); 6754 } 6755 assert(Opcode != -1); 6756 6757 SmallVector<Register, 12> Ops; 6758 if (UseNSA && IsGFX11Plus) { 6759 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { 6760 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 6761 auto Merged = B.buildMergeLikeInstr( 6762 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); 6763 Ops.push_back(Merged.getReg(0)); 6764 }; 6765 6766 Ops.push_back(NodePtr); 6767 Ops.push_back(RayExtent); 6768 packLanes(RayOrigin); 6769 6770 if (IsA16) { 6771 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 6772 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 6773 auto MergedDir = B.buildMergeLikeInstr( 6774 V3S32, 6775 {B.buildBitcast( 6776 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0), 6777 UnmergeRayDir.getReg(0)})) 6778 .getReg(0), 6779 B.buildBitcast( 6780 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1), 6781 UnmergeRayDir.getReg(1)})) 6782 .getReg(0), 6783 B.buildBitcast( 6784 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2), 6785 UnmergeRayDir.getReg(2)})) 6786 .getReg(0)}); 6787 Ops.push_back(MergedDir.getReg(0)); 6788 } else { 6789 packLanes(RayDir); 6790 packLanes(RayInvDir); 6791 } 6792 } else { 6793 if (Is64) { 6794 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 6795 Ops.push_back(Unmerge.getReg(0)); 6796 Ops.push_back(Unmerge.getReg(1)); 6797 } else { 6798 Ops.push_back(NodePtr); 6799 } 6800 Ops.push_back(RayExtent); 6801 6802 auto packLanes = [&Ops, &S32, &B](Register Src) { 6803 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 6804 Ops.push_back(Unmerge.getReg(0)); 6805 Ops.push_back(Unmerge.getReg(1)); 6806 Ops.push_back(Unmerge.getReg(2)); 6807 }; 6808 6809 packLanes(RayOrigin); 6810 if (IsA16) { 6811 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 6812 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 6813 Register R1 = MRI.createGenericVirtualRegister(S32); 6814 Register R2 = MRI.createGenericVirtualRegister(S32); 6815 Register R3 = MRI.createGenericVirtualRegister(S32); 6816 B.buildMergeLikeInstr(R1, 6817 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 6818 B.buildMergeLikeInstr( 6819 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 6820 B.buildMergeLikeInstr( 6821 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 6822 Ops.push_back(R1); 6823 Ops.push_back(R2); 6824 Ops.push_back(R3); 6825 } else { 6826 packLanes(RayDir); 6827 packLanes(RayInvDir); 6828 } 6829 } 6830 6831 if (!UseNSA) { 6832 // Build a single vector containing all the operands so far prepared. 6833 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 6834 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0); 6835 Ops.clear(); 6836 Ops.push_back(MergedOps); 6837 } 6838 6839 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 6840 .addDef(DstReg) 6841 .addImm(Opcode); 6842 6843 for (Register R : Ops) { 6844 MIB.addUse(R); 6845 } 6846 6847 MIB.addUse(TDescr) 6848 .addImm(IsA16 ? 
1 : 0) 6849 .cloneMemRefs(MI); 6850 6851 MI.eraseFromParent(); 6852 return true; 6853 } 6854 6855 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, 6856 MachineIRBuilder &B) const { 6857 unsigned Opc; 6858 int RoundMode = MI.getOperand(2).getImm(); 6859 6860 if (RoundMode == (int)RoundingMode::TowardPositive) 6861 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; 6862 else if (RoundMode == (int)RoundingMode::TowardNegative) 6863 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; 6864 else 6865 return false; 6866 6867 B.buildInstr(Opc) 6868 .addDef(MI.getOperand(0).getReg()) 6869 .addUse(MI.getOperand(1).getReg()); 6870 6871 MI.eraseFromParent(); 6872 6873 return true; 6874 } 6875 6876 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI, 6877 MachineIRBuilder &B) const { 6878 const SITargetLowering *TLI = ST.getTargetLowering(); 6879 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore(); 6880 Register DstReg = MI.getOperand(0).getReg(); 6881 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr}); 6882 MI.eraseFromParent(); 6883 return true; 6884 } 6885 6886 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI, 6887 MachineIRBuilder &B) const { 6888 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. 6889 if (!ST.hasArchitectedSGPRs()) 6890 return false; 6891 LLT S32 = LLT::scalar(32); 6892 Register DstReg = MI.getOperand(0).getReg(); 6893 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8)); 6894 auto LSB = B.buildConstant(S32, 25); 6895 auto Width = B.buildConstant(S32, 5); 6896 B.buildUbfx(DstReg, TTMP8, LSB, Width); 6897 MI.eraseFromParent(); 6898 return true; 6899 } 6900 6901 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 6902 MachineInstr &MI) const { 6903 MachineIRBuilder &B = Helper.MIRBuilder; 6904 MachineRegisterInfo &MRI = *B.getMRI(); 6905 6906 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 6907 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); 6908 switch (IntrID) { 6909 case Intrinsic::amdgcn_if: 6910 case Intrinsic::amdgcn_else: { 6911 MachineInstr *Br = nullptr; 6912 MachineBasicBlock *UncondBrTarget = nullptr; 6913 bool Negated = false; 6914 if (MachineInstr *BrCond = 6915 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 6916 const SIRegisterInfo *TRI 6917 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 6918 6919 Register Def = MI.getOperand(1).getReg(); 6920 Register Use = MI.getOperand(3).getReg(); 6921 6922 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 6923 6924 if (Negated) 6925 std::swap(CondBrTarget, UncondBrTarget); 6926 6927 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 6928 if (IntrID == Intrinsic::amdgcn_if) { 6929 B.buildInstr(AMDGPU::SI_IF) 6930 .addDef(Def) 6931 .addUse(Use) 6932 .addMBB(UncondBrTarget); 6933 } else { 6934 B.buildInstr(AMDGPU::SI_ELSE) 6935 .addDef(Def) 6936 .addUse(Use) 6937 .addMBB(UncondBrTarget); 6938 } 6939 6940 if (Br) { 6941 Br->getOperand(0).setMBB(CondBrTarget); 6942 } else { 6943 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 6944 // since we're swapping branch targets it needs to be reinserted. 
6945 // FIXME: IRTranslator should probably not do this 6946 B.buildBr(*CondBrTarget); 6947 } 6948 6949 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 6950 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 6951 MI.eraseFromParent(); 6952 BrCond->eraseFromParent(); 6953 return true; 6954 } 6955 6956 return false; 6957 } 6958 case Intrinsic::amdgcn_loop: { 6959 MachineInstr *Br = nullptr; 6960 MachineBasicBlock *UncondBrTarget = nullptr; 6961 bool Negated = false; 6962 if (MachineInstr *BrCond = 6963 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 6964 const SIRegisterInfo *TRI 6965 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 6966 6967 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 6968 Register Reg = MI.getOperand(2).getReg(); 6969 6970 if (Negated) 6971 std::swap(CondBrTarget, UncondBrTarget); 6972 6973 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 6974 B.buildInstr(AMDGPU::SI_LOOP) 6975 .addUse(Reg) 6976 .addMBB(UncondBrTarget); 6977 6978 if (Br) 6979 Br->getOperand(0).setMBB(CondBrTarget); 6980 else 6981 B.buildBr(*CondBrTarget); 6982 6983 MI.eraseFromParent(); 6984 BrCond->eraseFromParent(); 6985 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 6986 return true; 6987 } 6988 6989 return false; 6990 } 6991 case Intrinsic::amdgcn_make_buffer_rsrc: 6992 return legalizePointerAsRsrcIntrin(MI, MRI, B); 6993 case Intrinsic::amdgcn_kernarg_segment_ptr: 6994 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 6995 // This only makes sense to call in a kernel, so just lower to null. 6996 B.buildConstant(MI.getOperand(0).getReg(), 0); 6997 MI.eraseFromParent(); 6998 return true; 6999 } 7000 7001 return legalizePreloadedArgIntrin( 7002 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 7003 case Intrinsic::amdgcn_implicitarg_ptr: 7004 return legalizeImplicitArgPtr(MI, MRI, B); 7005 case Intrinsic::amdgcn_workitem_id_x: 7006 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, 7007 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 7008 case Intrinsic::amdgcn_workitem_id_y: 7009 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, 7010 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 7011 case Intrinsic::amdgcn_workitem_id_z: 7012 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, 7013 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 7014 case Intrinsic::amdgcn_workgroup_id_x: 7015 return legalizePreloadedArgIntrin(MI, MRI, B, 7016 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 7017 case Intrinsic::amdgcn_workgroup_id_y: 7018 return legalizePreloadedArgIntrin(MI, MRI, B, 7019 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 7020 case Intrinsic::amdgcn_workgroup_id_z: 7021 return legalizePreloadedArgIntrin(MI, MRI, B, 7022 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 7023 case Intrinsic::amdgcn_wave_id: 7024 return legalizeWaveID(MI, B); 7025 case Intrinsic::amdgcn_lds_kernel_id: 7026 return legalizePreloadedArgIntrin(MI, MRI, B, 7027 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 7028 case Intrinsic::amdgcn_dispatch_ptr: 7029 return legalizePreloadedArgIntrin(MI, MRI, B, 7030 AMDGPUFunctionArgInfo::DISPATCH_PTR); 7031 case Intrinsic::amdgcn_queue_ptr: 7032 return legalizePreloadedArgIntrin(MI, MRI, B, 7033 AMDGPUFunctionArgInfo::QUEUE_PTR); 7034 case Intrinsic::amdgcn_implicit_buffer_ptr: 7035 return legalizePreloadedArgIntrin( 7036 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 7037 case Intrinsic::amdgcn_dispatch_id: 7038 return legalizePreloadedArgIntrin(MI, MRI, B, 7039 AMDGPUFunctionArgInfo::DISPATCH_ID); 7040 case Intrinsic::r600_read_ngroups_x: 7041 // TODO: Emit 
an error for HSA.
7042     return legalizeKernargMemParameter(MI, B,
7043                                        SI::KernelInputOffsets::NGROUPS_X);
7044   case Intrinsic::r600_read_ngroups_y:
7045     return legalizeKernargMemParameter(MI, B,
7046                                        SI::KernelInputOffsets::NGROUPS_Y);
7047   case Intrinsic::r600_read_ngroups_z:
7048     return legalizeKernargMemParameter(MI, B,
7049                                        SI::KernelInputOffsets::NGROUPS_Z);
7050   case Intrinsic::r600_read_local_size_x:
7051     // TODO: Could insert G_ASSERT_ZEXT from s16
7052     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7053   case Intrinsic::r600_read_local_size_y:
7054     // TODO: Could insert G_ASSERT_ZEXT from s16
7055     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7056   case Intrinsic::r600_read_local_size_z:
7057     // TODO: Could insert G_ASSERT_ZEXT from s16
7058     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7059   case Intrinsic::r600_read_global_size_x:
7060     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7061   case Intrinsic::r600_read_global_size_y:
7062     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7063   case Intrinsic::r600_read_global_size_z:
7064     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7065   case Intrinsic::amdgcn_fdiv_fast:
7066     return legalizeFDIVFastIntrin(MI, MRI, B);
7067   case Intrinsic::amdgcn_is_shared:
7068     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7069   case Intrinsic::amdgcn_is_private:
7070     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7071   case Intrinsic::amdgcn_wavefrontsize: {
7072     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7073     MI.eraseFromParent();
7074     return true;
7075   }
7076   case Intrinsic::amdgcn_s_buffer_load:
7077     return legalizeSBufferLoad(Helper, MI);
7078   case Intrinsic::amdgcn_raw_buffer_store:
7079   case Intrinsic::amdgcn_raw_ptr_buffer_store:
7080   case Intrinsic::amdgcn_struct_buffer_store:
7081   case Intrinsic::amdgcn_struct_ptr_buffer_store:
7082     return legalizeBufferStore(MI, MRI, B, false, false);
7083   case Intrinsic::amdgcn_raw_buffer_store_format:
7084   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7085   case Intrinsic::amdgcn_struct_buffer_store_format:
7086   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7087     return legalizeBufferStore(MI, MRI, B, false, true);
7088   case Intrinsic::amdgcn_raw_tbuffer_store:
7089   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7090   case Intrinsic::amdgcn_struct_tbuffer_store:
7091   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7092     return legalizeBufferStore(MI, MRI, B, true, true);
7093   case Intrinsic::amdgcn_raw_buffer_load:
7094   case Intrinsic::amdgcn_raw_ptr_buffer_load:
7095   case Intrinsic::amdgcn_struct_buffer_load:
7096   case Intrinsic::amdgcn_struct_ptr_buffer_load:
7097     return legalizeBufferLoad(MI, MRI, B, false, false);
7098   case Intrinsic::amdgcn_raw_buffer_load_format:
7099   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7100   case Intrinsic::amdgcn_struct_buffer_load_format:
7101   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7102     return legalizeBufferLoad(MI, MRI, B, true, false);
7103   case Intrinsic::amdgcn_raw_tbuffer_load:
7104   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7105   case Intrinsic::amdgcn_struct_tbuffer_load:
7106   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7107     return legalizeBufferLoad(MI, MRI, B, true, true);
7108   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7109   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7110 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 7111 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 7112 case Intrinsic::amdgcn_raw_buffer_atomic_add: 7113 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 7114 case Intrinsic::amdgcn_struct_buffer_atomic_add: 7115 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 7116 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 7117 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 7118 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 7119 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 7120 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 7121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 7122 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 7123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 7124 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 7125 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 7126 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 7127 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 7128 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 7129 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 7130 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 7131 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 7132 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 7133 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 7134 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 7135 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 7136 case Intrinsic::amdgcn_raw_buffer_atomic_and: 7137 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 7138 case Intrinsic::amdgcn_struct_buffer_atomic_and: 7139 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 7140 case Intrinsic::amdgcn_raw_buffer_atomic_or: 7141 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 7142 case Intrinsic::amdgcn_struct_buffer_atomic_or: 7143 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 7144 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 7145 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 7146 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 7147 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 7148 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 7149 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 7150 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 7151 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 7152 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 7153 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 7154 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 7155 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 7156 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 7157 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 7158 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 7159 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 7160 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 7161 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 7162 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 7163 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 7164 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 7165 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 7166 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 7167 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 7168 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 7169 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 7170 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 7171 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 7172 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: 7173 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16: 7174 
case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
7175   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
7176     return legalizeBufferAtomic(MI, B, IntrID);
7177   case Intrinsic::trap:
7178     return legalizeTrapIntrinsic(MI, MRI, B);
7179   case Intrinsic::debugtrap:
7180     return legalizeDebugTrapIntrinsic(MI, MRI, B);
7181   case Intrinsic::amdgcn_rsq_clamp:
7182     return legalizeRsqClampIntrinsic(MI, MRI, B);
7183   case Intrinsic::amdgcn_ds_fadd:
7184   case Intrinsic::amdgcn_ds_fmin:
7185   case Intrinsic::amdgcn_ds_fmax:
7186     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
7187   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7188     return legalizeBVHIntrinsic(MI, B);
7189   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7190   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7191   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7192   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7193   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7194   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7195   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7196   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7197     Register Index = MI.getOperand(5).getReg();
7198     LLT S32 = LLT::scalar(32);
7199     if (MRI.getType(Index) != S32)
7200       MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7201     return true;
7202   }
7203   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7204   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7205   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7206     Register Index = MI.getOperand(7).getReg();
7207     LLT S32 = LLT::scalar(32);
7208     if (MRI.getType(Index) != S32)
7209       MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7210     return true;
7211   }
7212   case Intrinsic::amdgcn_fmed3: {
7213     GISelChangeObserver &Observer = Helper.Observer;
7214
7215     // FIXME: This is to work around the inability of TableGen match combiners
7216     // to match intrinsics in patterns.
7217     Observer.changingInstr(MI);
7218     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7219     MI.removeOperand(1);
7220     Observer.changedInstr(MI);
7221     return true;
7222   }
7223   default: {
7224     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7225             AMDGPU::getImageDimIntrinsicInfo(IntrID))
7226       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7227     return true;
7228   }
7229   }
7230
7231   return true;
7232 }
7233