//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding an
/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
/// excludes s1 vectors, which should always be scalarized.
67 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 68 return [=](const LegalityQuery &Query) { 69 const LLT Ty = Query.Types[TypeIdx]; 70 if (!Ty.isVector()) 71 return false; 72 73 const LLT EltTy = Ty.getElementType(); 74 const unsigned EltSize = EltTy.getSizeInBits(); 75 return Ty.getNumElements() % 2 != 0 && 76 EltSize > 1 && EltSize < 32 && 77 Ty.getSizeInBits() % 32 != 0; 78 }; 79 } 80 81 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 82 return [=](const LegalityQuery &Query) { 83 const LLT Ty = Query.Types[TypeIdx]; 84 return Ty.getSizeInBits() % 32 == 0; 85 }; 86 } 87 88 static LegalityPredicate isWideVec16(unsigned TypeIdx) { 89 return [=](const LegalityQuery &Query) { 90 const LLT Ty = Query.Types[TypeIdx]; 91 const LLT EltTy = Ty.getScalarType(); 92 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 93 }; 94 } 95 96 static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 97 return [=](const LegalityQuery &Query) { 98 const LLT Ty = Query.Types[TypeIdx]; 99 const LLT EltTy = Ty.getElementType(); 100 return std::pair(TypeIdx, 101 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 102 }; 103 } 104 105 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 106 return [=](const LegalityQuery &Query) { 107 const LLT Ty = Query.Types[TypeIdx]; 108 const LLT EltTy = Ty.getElementType(); 109 unsigned Size = Ty.getSizeInBits(); 110 unsigned Pieces = (Size + 63) / 64; 111 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 112 return std::pair(TypeIdx, LLT::scalarOrVector( 113 ElementCount::getFixed(NewNumElts), EltTy)); 114 }; 115 } 116 117 // Increase the number of vector elements to reach the next multiple of 32-bit 118 // type. 119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 120 return [=](const LegalityQuery &Query) { 121 const LLT Ty = Query.Types[TypeIdx]; 122 123 const LLT EltTy = Ty.getElementType(); 124 const int Size = Ty.getSizeInBits(); 125 const int EltSize = EltTy.getSizeInBits(); 126 const int NextMul32 = (Size + 31) / 32; 127 128 assert(EltSize < 32); 129 130 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 131 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 132 }; 133 } 134 135 // Increase the number of vector elements to reach the next legal RegClass. 136 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { 137 return [=](const LegalityQuery &Query) { 138 const LLT Ty = Query.Types[TypeIdx]; 139 const unsigned NumElts = Ty.getNumElements(); 140 const unsigned EltSize = Ty.getElementType().getSizeInBits(); 141 const unsigned MaxNumElts = MaxRegisterSize / EltSize; 142 143 assert(EltSize == 32 || EltSize == 64); 144 assert(Ty.getSizeInBits() < MaxRegisterSize); 145 146 unsigned NewNumElts; 147 // Find the nearest legal RegClass that is larger than the current type. 
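    // Illustrative sketch (editorial addition): assuming there is no SGPR
    // class for 448-bit values, a <7 x s64> input (448 bits) would make the
    // loop below settle on <8 x s64> (512 bits), the next width accepted by
    // getSGPRClassForBitWidth.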
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
      if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
        break;
    }

    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
  };
}

static LLT getBufferRsrcScalarType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);
  const ElementCount NumElems = Ty.getElementCount();
  return LLT::vector(NumElems, LLT::scalar(128));
}

static LLT getBufferRsrcRegisterType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::fixed_vector(4, LLT::scalar(32));
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size, and
// multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

// RegisterType that doesn't have a corresponding RegClass.
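// (Editorial note) Such types are handled by pairing this predicate with
// moreElementsToNextExistingRegClass, e.g. in the G_BUILD_VECTOR and vector
// insert/extract rules further below.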
256 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { 257 return [=](const LegalityQuery &Query) { 258 LLT Ty = Query.Types[TypeIdx]; 259 return isRegisterType(Ty) && 260 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits()); 261 }; 262 } 263 264 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 265 return [=](const LegalityQuery &Query) { 266 const LLT QueryTy = Query.Types[TypeIdx]; 267 if (!QueryTy.isVector()) 268 return false; 269 const LLT EltTy = QueryTy.getElementType(); 270 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 271 }; 272 } 273 274 // If we have a truncating store or an extending load with a data size larger 275 // than 32-bits, we need to reduce to a 32-bit type. 276 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 277 return [=](const LegalityQuery &Query) { 278 const LLT Ty = Query.Types[TypeIdx]; 279 return !Ty.isVector() && Ty.getSizeInBits() > 32 && 280 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 281 }; 282 } 283 284 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 285 // handle some operations by just promoting the register during 286 // selection. There are also d16 loads on GFX9+ which preserve the high bits. 287 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 288 bool IsLoad, bool IsAtomic) { 289 switch (AS) { 290 case AMDGPUAS::PRIVATE_ADDRESS: 291 // FIXME: Private element size. 292 return ST.enableFlatScratch() ? 128 : 32; 293 case AMDGPUAS::LOCAL_ADDRESS: 294 return ST.useDS128() ? 128 : 64; 295 case AMDGPUAS::GLOBAL_ADDRESS: 296 case AMDGPUAS::CONSTANT_ADDRESS: 297 case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 298 case AMDGPUAS::BUFFER_RESOURCE: 299 // Treat constant and global as identical. SMRD loads are sometimes usable for 300 // global loads (ideally constant address space should be eliminated) 301 // depending on the context. Legality cannot be context dependent, but 302 // RegBankSelect can split the load as necessary depending on the pointer 303 // register bank/uniformity and if the memory is invariant or not written in a 304 // kernel. 305 return IsLoad ? 512 : 128; 306 default: 307 // FIXME: Flat addresses may contextually need to be split to 32-bit parts 308 // if they may alias scratch depending on the subtarget. This needs to be 309 // moved to custom handling to use addressMayBeAccessedAsPrivate 310 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32; 311 } 312 } 313 314 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 315 const LegalityQuery &Query) { 316 const LLT Ty = Query.Types[0]; 317 318 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 319 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 320 321 unsigned RegSize = Ty.getSizeInBits(); 322 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 323 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 324 unsigned AS = Query.Types[1].getAddressSpace(); 325 326 // All of these need to be custom lowered to cast the pointer operand. 327 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 328 return false; 329 330 // Do not handle extending vector loads. 331 if (Ty.isVector() && MemSize != RegSize) 332 return false; 333 334 // TODO: We should be able to widen loads if the alignment is high enough, but 335 // we also need to modify the memory access size. 336 #if 0 337 // Accept widening loads based on alignment. 
  if (IsLoad && MemSize < RegSize)
    MemSize = std::max(MemSize, AlignBits);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The newer buffer intrinsic forms take their resource arguments as
// pointers in address space 8, aka s128 values. However, in order to not break
// SelectionDAG, the underlying operations have to continue to take v4i32
// arguments. Therefore, we convert resource pointers - or vectors of them -
// to integer values here.
static bool hasBufferRsrcWorkaround(const LLT Ty) {
  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
    return true;
  if (Ty.isVector()) {
    const LLT ElemTy = Ty.getElementType();
    return hasBufferRsrcWorkaround(ElemTy);
  }
  return false;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  // Address space 8 pointers get their own workaround.
  if (hasBufferRsrcWorkaround(Ty))
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment.
/// Note this is the case when the memory access itself changes, not the size
/// of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}

/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form in which the value must be in order to be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics.
This is a hack, which is 535 /// needed in order to account for the fact that we can't define a register 536 /// class for s128 without breaking SelectionDAG. 537 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) { 538 MachineRegisterInfo &MRI = *B.getMRI(); 539 const LLT PointerTy = MRI.getType(Pointer); 540 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); 541 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); 542 543 if (!PointerTy.isVector()) { 544 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32) 545 SmallVector<Register, 4> PointerParts; 546 const unsigned NumParts = PointerTy.getSizeInBits() / 32; 547 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer); 548 for (unsigned I = 0; I < NumParts; ++I) 549 PointerParts.push_back(Unmerged.getReg(I)); 550 return B.buildBuildVector(VectorTy, PointerParts).getReg(0); 551 } 552 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0); 553 return B.buildBitcast(VectorTy, Scalar).getReg(0); 554 } 555 556 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, 557 unsigned Idx) { 558 MachineOperand &MO = MI.getOperand(Idx); 559 560 const LLT PointerTy = B.getMRI()->getType(MO.getReg()); 561 // Paranoidly prevent us from doing this multiple times. 562 if (!hasBufferRsrcWorkaround(PointerTy)) 563 return; 564 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B)); 565 } 566 567 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 568 const GCNTargetMachine &TM) 569 : ST(ST_) { 570 using namespace TargetOpcode; 571 572 auto GetAddrSpacePtr = [&TM](unsigned AS) { 573 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 574 }; 575 576 const LLT S1 = LLT::scalar(1); 577 const LLT S8 = LLT::scalar(8); 578 const LLT S16 = LLT::scalar(16); 579 const LLT S32 = LLT::scalar(32); 580 const LLT S64 = LLT::scalar(64); 581 const LLT S128 = LLT::scalar(128); 582 const LLT S256 = LLT::scalar(256); 583 const LLT S512 = LLT::scalar(512); 584 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 585 586 const LLT V2S8 = LLT::fixed_vector(2, 8); 587 const LLT V2S16 = LLT::fixed_vector(2, 16); 588 const LLT V4S16 = LLT::fixed_vector(4, 16); 589 590 const LLT V2S32 = LLT::fixed_vector(2, 32); 591 const LLT V3S32 = LLT::fixed_vector(3, 32); 592 const LLT V4S32 = LLT::fixed_vector(4, 32); 593 const LLT V5S32 = LLT::fixed_vector(5, 32); 594 const LLT V6S32 = LLT::fixed_vector(6, 32); 595 const LLT V7S32 = LLT::fixed_vector(7, 32); 596 const LLT V8S32 = LLT::fixed_vector(8, 32); 597 const LLT V9S32 = LLT::fixed_vector(9, 32); 598 const LLT V10S32 = LLT::fixed_vector(10, 32); 599 const LLT V11S32 = LLT::fixed_vector(11, 32); 600 const LLT V12S32 = LLT::fixed_vector(12, 32); 601 const LLT V13S32 = LLT::fixed_vector(13, 32); 602 const LLT V14S32 = LLT::fixed_vector(14, 32); 603 const LLT V15S32 = LLT::fixed_vector(15, 32); 604 const LLT V16S32 = LLT::fixed_vector(16, 32); 605 const LLT V32S32 = LLT::fixed_vector(32, 32); 606 607 const LLT V2S64 = LLT::fixed_vector(2, 64); 608 const LLT V3S64 = LLT::fixed_vector(3, 64); 609 const LLT V4S64 = LLT::fixed_vector(4, 64); 610 const LLT V5S64 = LLT::fixed_vector(5, 64); 611 const LLT V6S64 = LLT::fixed_vector(6, 64); 612 const LLT V7S64 = LLT::fixed_vector(7, 64); 613 const LLT V8S64 = LLT::fixed_vector(8, 64); 614 const LLT V16S64 = LLT::fixed_vector(16, 64); 615 616 std::initializer_list<LLT> AllS32Vectors = 617 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 618 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 619 
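  // (Editorial note) These explicit lists keep the rule definitions below
  // compact; for example, the G_PHI rule marks every entry of AllS32Vectors
  // and AllS64Vectors legal directly instead of going through a predicate.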
std::initializer_list<LLT> AllS64Vectors = 620 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 621 622 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 623 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 624 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 625 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 626 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 627 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 628 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 629 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER); 630 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE); 631 632 const LLT CodePtr = FlatPtr; 633 634 const std::initializer_list<LLT> AddrSpaces64 = { 635 GlobalPtr, ConstantPtr, FlatPtr 636 }; 637 638 const std::initializer_list<LLT> AddrSpaces32 = { 639 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 640 }; 641 642 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr}; 643 644 const std::initializer_list<LLT> FPTypesBase = { 645 S32, S64 646 }; 647 648 const std::initializer_list<LLT> FPTypes16 = { 649 S32, S64, S16 650 }; 651 652 const std::initializer_list<LLT> FPTypesPK16 = { 653 S32, S64, S16, V2S16 654 }; 655 656 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 657 658 // s1 for VCC branches, s32 for SCC branches. 659 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 660 661 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 662 // elements for v3s16 663 getActionDefinitionsBuilder(G_PHI) 664 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 665 .legalFor(AllS32Vectors) 666 .legalFor(AllS64Vectors) 667 .legalFor(AddrSpaces64) 668 .legalFor(AddrSpaces32) 669 .legalFor(AddrSpaces128) 670 .legalIf(isPointer(0)) 671 .clampScalar(0, S16, S256) 672 .widenScalarToNextPow2(0, 32) 673 .clampMaxNumElements(0, S32, 16) 674 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 675 .scalarize(0); 676 677 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 678 // Full set of gfx9 features. 679 getActionDefinitionsBuilder({G_ADD, G_SUB}) 680 .legalFor({S32, S16, V2S16}) 681 .clampMaxNumElementsStrict(0, S16, 2) 682 .scalarize(0) 683 .minScalar(0, S16) 684 .widenScalarToNextMultipleOf(0, 32) 685 .maxScalar(0, S32); 686 687 getActionDefinitionsBuilder(G_MUL) 688 .legalFor({S32, S16, V2S16}) 689 .clampMaxNumElementsStrict(0, S16, 2) 690 .scalarize(0) 691 .minScalar(0, S16) 692 .widenScalarToNextMultipleOf(0, 32) 693 .custom(); 694 assert(ST.hasMad64_32()); 695 696 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 697 .legalFor({S32, S16, V2S16}) // Clamp modifier 698 .minScalarOrElt(0, S16) 699 .clampMaxNumElementsStrict(0, S16, 2) 700 .scalarize(0) 701 .widenScalarToNextPow2(0, 32) 702 .lower(); 703 } else if (ST.has16BitInsts()) { 704 getActionDefinitionsBuilder({G_ADD, G_SUB}) 705 .legalFor({S32, S16}) 706 .minScalar(0, S16) 707 .widenScalarToNextMultipleOf(0, 32) 708 .maxScalar(0, S32) 709 .scalarize(0); 710 711 getActionDefinitionsBuilder(G_MUL) 712 .legalFor({S32, S16}) 713 .scalarize(0) 714 .minScalar(0, S16) 715 .widenScalarToNextMultipleOf(0, 32) 716 .custom(); 717 assert(ST.hasMad64_32()); 718 719 // Technically the saturating operations require clamp bit support, but this 720 // was introduced at the same time as 16-bit operations. 
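  // Illustrative sketch (editorial, not upstream): where the clamp modifier is
  // unavailable, the generic .lower() fallback for G_UADDSAT expands roughly as
  //   %sum = G_ADD %x, %y
  //   %ovf = G_ICMP ult, %sum, %x
  //   %res = G_SELECT %ovf, -1, %sum
  // which is why the directly legal 16/32-bit forms are preferred here.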
721 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 722 .legalFor({S32, S16}) // Clamp modifier 723 .minScalar(0, S16) 724 .scalarize(0) 725 .widenScalarToNextPow2(0, 16) 726 .lower(); 727 728 // We're just lowering this, but it helps get a better result to try to 729 // coerce to the desired type first. 730 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 731 .minScalar(0, S16) 732 .scalarize(0) 733 .lower(); 734 } else { 735 getActionDefinitionsBuilder({G_ADD, G_SUB}) 736 .legalFor({S32}) 737 .widenScalarToNextMultipleOf(0, 32) 738 .clampScalar(0, S32, S32) 739 .scalarize(0); 740 741 auto &Mul = getActionDefinitionsBuilder(G_MUL) 742 .legalFor({S32}) 743 .scalarize(0) 744 .minScalar(0, S32) 745 .widenScalarToNextMultipleOf(0, 32); 746 747 if (ST.hasMad64_32()) 748 Mul.custom(); 749 else 750 Mul.maxScalar(0, S32); 751 752 if (ST.hasIntClamp()) { 753 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 754 .legalFor({S32}) // Clamp modifier. 755 .scalarize(0) 756 .minScalarOrElt(0, S32) 757 .lower(); 758 } else { 759 // Clamp bit support was added in VI, along with 16-bit operations. 760 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 761 .minScalar(0, S32) 762 .scalarize(0) 763 .lower(); 764 } 765 766 // FIXME: DAG expansion gets better results. The widening uses the smaller 767 // range values and goes for the min/max lowering directly. 768 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 769 .minScalar(0, S32) 770 .scalarize(0) 771 .lower(); 772 } 773 774 getActionDefinitionsBuilder( 775 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 776 .customFor({S32, S64}) 777 .clampScalar(0, S32, S64) 778 .widenScalarToNextPow2(0, 32) 779 .scalarize(0); 780 781 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 782 .legalFor({S32}) 783 .maxScalar(0, S32); 784 785 if (ST.hasVOP3PInsts()) { 786 Mulh 787 .clampMaxNumElements(0, S8, 2) 788 .lowerFor({V2S8}); 789 } 790 791 Mulh 792 .scalarize(0) 793 .lower(); 794 795 // Report legal for any types we can handle anywhere. For the cases only legal 796 // on the SALU, RegBankSelect will be able to re-legalize. 797 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 798 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 799 .clampScalar(0, S32, S64) 800 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 801 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 802 .widenScalarToNextPow2(0) 803 .scalarize(0); 804 805 getActionDefinitionsBuilder( 806 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 807 .legalFor({{S32, S1}, {S32, S32}}) 808 .clampScalar(0, S32, S32) 809 .scalarize(0); 810 811 getActionDefinitionsBuilder(G_BITCAST) 812 // Don't worry about the size constraint. 813 .legalIf(all(isRegisterType(0), isRegisterType(1))) 814 .lower(); 815 816 817 getActionDefinitionsBuilder(G_CONSTANT) 818 .legalFor({S1, S32, S64, S16, GlobalPtr, 819 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 820 .legalIf(isPointer(0)) 821 .clampScalar(0, S32, S64) 822 .widenScalarToNextPow2(0); 823 824 getActionDefinitionsBuilder(G_FCONSTANT) 825 .legalFor({S32, S64, S16}) 826 .clampScalar(0, S16, S64); 827 828 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 829 .legalIf(isRegisterType(0)) 830 // s1 and s16 are special cases because they have legal operations on 831 // them, but don't really occupy registers in the normal way. 
832 .legalFor({S1, S16}) 833 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 834 .clampScalarOrElt(0, S32, MaxScalar) 835 .widenScalarToNextPow2(0, 32) 836 .clampMaxNumElements(0, S32, 16); 837 838 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 839 840 // If the amount is divergent, we have to do a wave reduction to get the 841 // maximum value, so this is expanded during RegBankSelect. 842 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 843 .legalFor({{PrivatePtr, S32}}); 844 845 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 846 .customIf(typeIsNot(0, PrivatePtr)); 847 848 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 849 850 auto &FPOpActions = getActionDefinitionsBuilder( 851 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE, 852 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA}) 853 .legalFor({S32, S64}); 854 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 855 .customFor({S32, S64}); 856 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 857 .customFor({S32, S64}); 858 859 if (ST.has16BitInsts()) { 860 if (ST.hasVOP3PInsts()) 861 FPOpActions.legalFor({S16, V2S16}); 862 else 863 FPOpActions.legalFor({S16}); 864 865 TrigActions.customFor({S16}); 866 FDIVActions.customFor({S16}); 867 } 868 869 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 870 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 871 872 if (ST.hasVOP3PInsts()) { 873 MinNumMaxNum.customFor(FPTypesPK16) 874 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 875 .clampMaxNumElements(0, S16, 2) 876 .clampScalar(0, S16, S64) 877 .scalarize(0); 878 } else if (ST.has16BitInsts()) { 879 MinNumMaxNum.customFor(FPTypes16) 880 .clampScalar(0, S16, S64) 881 .scalarize(0); 882 } else { 883 MinNumMaxNum.customFor(FPTypesBase) 884 .clampScalar(0, S32, S64) 885 .scalarize(0); 886 } 887 888 if (ST.hasVOP3PInsts()) 889 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 890 891 FPOpActions 892 .scalarize(0) 893 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 894 895 TrigActions 896 .scalarize(0) 897 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 898 899 FDIVActions 900 .scalarize(0) 901 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 902 903 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 904 .legalFor(FPTypesPK16) 905 .clampMaxNumElementsStrict(0, S16, 2) 906 .scalarize(0) 907 .clampScalar(0, S16, S64); 908 909 if (ST.has16BitInsts()) { 910 getActionDefinitionsBuilder(G_FSQRT) 911 .legalFor({S32, S16}) 912 .customFor({S64}) 913 .scalarize(0) 914 .clampScalar(0, S16, S64); 915 getActionDefinitionsBuilder(G_FFLOOR) 916 .legalFor({S32, S64, S16}) 917 .scalarize(0) 918 .clampScalar(0, S16, S64); 919 920 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 921 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}}) 922 .scalarize(0) 923 .maxScalarIf(typeIs(0, S16), 1, S16) 924 .clampScalar(1, S32, S32) 925 .lower(); 926 927 getActionDefinitionsBuilder(G_FFREXP) 928 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}}) 929 .scalarize(0) 930 .lower(); 931 } else { 932 getActionDefinitionsBuilder(G_FSQRT) 933 .legalFor({S32}) 934 .customFor({S64}) 935 .scalarize(0) 936 .clampScalar(0, S32, S64); 937 938 if (ST.hasFractBug()) { 939 getActionDefinitionsBuilder(G_FFLOOR) 940 .customFor({S64}) 941 .legalFor({S32, S64}) 942 .scalarize(0) 943 .clampScalar(0, S32, S64); 944 } else { 945 getActionDefinitionsBuilder(G_FFLOOR) 946 .legalFor({S32, S64}) 947 .scalarize(0) 948 .clampScalar(0, S32, S64); 949 } 950 951 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 952 .legalFor({{S32, S32}, {S64, S32}}) 953 .scalarize(0) 954 .clampScalar(0, S32, S64) 955 .clampScalar(1, S32, S32) 956 .lower(); 957 958 getActionDefinitionsBuilder(G_FFREXP) 959 .customFor({{S32, S32}, {S64, S32}}) 960 .scalarize(0) 961 .minScalar(0, S32) 962 .clampScalar(1, S32, S32) 963 .lower(); 964 } 965 966 getActionDefinitionsBuilder(G_FPTRUNC) 967 .legalFor({{S32, S64}, {S16, S32}}) 968 .scalarize(0) 969 .lower(); 970 971 getActionDefinitionsBuilder(G_FPEXT) 972 .legalFor({{S64, S32}, {S32, S16}}) 973 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 974 .scalarize(0); 975 976 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB}); 977 if (ST.has16BitInsts()) { 978 FSubActions 979 // Use actual fsub instruction 980 .legalFor({S32, S16}) 981 // Must use fadd + fneg 982 .lowerFor({S64, V2S16}); 983 } else { 984 FSubActions 985 // Use actual fsub instruction 986 .legalFor({S32}) 987 // Must use fadd + fneg 988 .lowerFor({S64, S16, V2S16}); 989 } 990 991 FSubActions 992 .scalarize(0) 993 .clampScalar(0, S32, S64); 994 995 // Whether this is legal depends on the floating point mode for the function. 996 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 997 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 998 FMad.customFor({S32, S16}); 999 else if (ST.hasMadMacF32Insts()) 1000 FMad.customFor({S32}); 1001 else if (ST.hasMadF16()) 1002 FMad.customFor({S16}); 1003 FMad.scalarize(0) 1004 .lower(); 1005 1006 auto &FRem = getActionDefinitionsBuilder(G_FREM); 1007 if (ST.has16BitInsts()) { 1008 FRem.customFor({S16, S32, S64}); 1009 } else { 1010 FRem.minScalar(0, S32) 1011 .customFor({S32, S64}); 1012 } 1013 FRem.scalarize(0); 1014 1015 // TODO: Do we need to clamp maximum bitwidth? 1016 getActionDefinitionsBuilder(G_TRUNC) 1017 .legalIf(isScalar(0)) 1018 .legalFor({{V2S16, V2S32}}) 1019 .clampMaxNumElements(0, S16, 2) 1020 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 1021 // situations (like an invalid implicit use), we don't want to infinite loop 1022 // in the legalizer. 
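    // Example (editorial): a G_TRUNC producing <2 x s32> from <2 x s64> is
    // scalarized into two s64 -> s32 truncs by the rule below, while a trunc
    // to a vector of s1 falls through to alwaysLegal() rather than looping.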
1023 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 1024 .alwaysLegal(); 1025 1026 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 1027 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 1028 {S32, S1}, {S64, S1}, {S16, S1}}) 1029 .scalarize(0) 1030 .clampScalar(0, S32, S64) 1031 .widenScalarToNextPow2(1, 32); 1032 1033 // TODO: Split s1->s64 during regbankselect for VALU. 1034 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 1035 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 1036 .lowerIf(typeIs(1, S1)) 1037 .customFor({{S32, S64}, {S64, S64}}); 1038 if (ST.has16BitInsts()) 1039 IToFP.legalFor({{S16, S16}}); 1040 IToFP.clampScalar(1, S32, S64) 1041 .minScalar(0, S32) 1042 .scalarize(0) 1043 .widenScalarToNextPow2(1); 1044 1045 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 1046 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 1047 .customFor({{S64, S32}, {S64, S64}}) 1048 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 1049 if (ST.has16BitInsts()) 1050 FPToI.legalFor({{S16, S16}}); 1051 else 1052 FPToI.minScalar(1, S32); 1053 1054 FPToI.minScalar(0, S32) 1055 .widenScalarToNextPow2(0, 32) 1056 .scalarize(0) 1057 .lower(); 1058 1059 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 1060 .customFor({S16, S32}) 1061 .scalarize(0) 1062 .lower(); 1063 1064 // Lower roundeven into G_FRINT 1065 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 1066 .scalarize(0) 1067 .lower(); 1068 1069 if (ST.has16BitInsts()) { 1070 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 1071 .legalFor({S16, S32, S64}) 1072 .clampScalar(0, S16, S64) 1073 .scalarize(0); 1074 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 1075 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 1076 .legalFor({S32, S64}) 1077 .clampScalar(0, S32, S64) 1078 .scalarize(0); 1079 } else { 1080 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 1081 .legalFor({S32}) 1082 .customFor({S64}) 1083 .clampScalar(0, S32, S64) 1084 .scalarize(0); 1085 } 1086 1087 getActionDefinitionsBuilder(G_PTR_ADD) 1088 .unsupportedFor({BufferFatPtr, RsrcPtr}) 1089 .legalIf(all(isPointer(0), sameSize(0, 1))) 1090 .scalarize(0) 1091 .scalarSameSizeAs(1, 0); 1092 1093 getActionDefinitionsBuilder(G_PTRMASK) 1094 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 1095 .scalarSameSizeAs(1, 0) 1096 .scalarize(0); 1097 1098 auto &CmpBuilder = 1099 getActionDefinitionsBuilder(G_ICMP) 1100 // The compare output type differs based on the register bank of the output, 1101 // so make both s1 and s32 legal. 1102 // 1103 // Scalar compares producing output in scc will be promoted to s32, as that 1104 // is the allocatable register type that will be needed for the copy from 1105 // scc. This will be promoted during RegBankSelect, and we assume something 1106 // before that won't try to use s32 result types. 1107 // 1108 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 1109 // bank. 
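      // Example (editorial): both {S1, S64} and {S32, GlobalPtr} are legal
      // result/operand pairs under the cartesian products below; RegBankSelect
      // later decides which form a particular compare actually uses.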
1110 .legalForCartesianProduct( 1111 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 1112 .legalForCartesianProduct( 1113 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 1114 if (ST.has16BitInsts()) { 1115 CmpBuilder.legalFor({{S1, S16}}); 1116 } 1117 1118 CmpBuilder 1119 .widenScalarToNextPow2(1) 1120 .clampScalar(1, S32, S64) 1121 .scalarize(0) 1122 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 1123 1124 getActionDefinitionsBuilder(G_FCMP) 1125 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 1126 .widenScalarToNextPow2(1) 1127 .clampScalar(1, S32, S64) 1128 .scalarize(0); 1129 1130 // FIXME: fpow has a selection pattern that should move to custom lowering. 1131 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW); 1132 if (ST.has16BitInsts()) 1133 ExpOps.customFor({{S32}, {S16}}); 1134 else 1135 ExpOps.customFor({S32}); 1136 ExpOps.clampScalar(0, MinScalarFPTy, S32) 1137 .scalarize(0); 1138 1139 getActionDefinitionsBuilder(G_FPOWI) 1140 .clampScalar(0, MinScalarFPTy, S32) 1141 .lower(); 1142 1143 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2}); 1144 Log2Ops.customFor({S32}); 1145 if (ST.has16BitInsts()) 1146 Log2Ops.legalFor({S16}); 1147 else 1148 Log2Ops.customFor({S16}); 1149 Log2Ops.scalarize(0) 1150 .lower(); 1151 1152 auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP}); 1153 LogOps.customFor({S32, S16}); 1154 LogOps.clampScalar(0, MinScalarFPTy, S32) 1155 .scalarize(0); 1156 1157 // The 64-bit versions produce 32-bit results, but only on the SALU. 1158 getActionDefinitionsBuilder(G_CTPOP) 1159 .legalFor({{S32, S32}, {S32, S64}}) 1160 .clampScalar(0, S32, S32) 1161 .widenScalarToNextPow2(1, 32) 1162 .clampScalar(1, S32, S64) 1163 .scalarize(0) 1164 .widenScalarToNextPow2(0, 32); 1165 1166 // If no 16 bit instr is available, lower into different instructions. 1167 if (ST.has16BitInsts()) 1168 getActionDefinitionsBuilder(G_IS_FPCLASS) 1169 .legalForCartesianProduct({S1}, FPTypes16) 1170 .widenScalarToNextPow2(1) 1171 .scalarize(0) 1172 .lower(); 1173 else 1174 getActionDefinitionsBuilder(G_IS_FPCLASS) 1175 .legalForCartesianProduct({S1}, FPTypesBase) 1176 .lowerFor({S1, S16}) 1177 .widenScalarToNextPow2(1) 1178 .scalarize(0) 1179 .lower(); 1180 1181 // The hardware instructions return a different result on 0 than the generic 1182 // instructions expect. The hardware produces -1, but these produce the 1183 // bitwidth. 1184 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 1185 .scalarize(0) 1186 .clampScalar(0, S32, S32) 1187 .clampScalar(1, S32, S64) 1188 .widenScalarToNextPow2(0, 32) 1189 .widenScalarToNextPow2(1, 32) 1190 .custom(); 1191 1192 // The 64-bit versions produce 32-bit results, but only on the SALU. 1193 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 1194 .legalFor({{S32, S32}, {S32, S64}}) 1195 .clampScalar(0, S32, S32) 1196 .clampScalar(1, S32, S64) 1197 .scalarize(0) 1198 .widenScalarToNextPow2(0, 32) 1199 .widenScalarToNextPow2(1, 32); 1200 1201 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1202 // RegBankSelect. 
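  // Illustrative (editorial): a divergent s64 G_BITREVERSE is expected to be
  // rewritten there as two s32 bitreverses whose results are swapped, since
  // the VALU only has a 32-bit reverse.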
1203 getActionDefinitionsBuilder(G_BITREVERSE) 1204 .legalFor({S32, S64}) 1205 .clampScalar(0, S32, S64) 1206 .scalarize(0) 1207 .widenScalarToNextPow2(0); 1208 1209 if (ST.has16BitInsts()) { 1210 getActionDefinitionsBuilder(G_BSWAP) 1211 .legalFor({S16, S32, V2S16}) 1212 .clampMaxNumElementsStrict(0, S16, 2) 1213 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1214 // narrowScalar limitation. 1215 .widenScalarToNextPow2(0) 1216 .clampScalar(0, S16, S32) 1217 .scalarize(0); 1218 1219 if (ST.hasVOP3PInsts()) { 1220 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1221 .legalFor({S32, S16, V2S16}) 1222 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1223 .clampMaxNumElements(0, S16, 2) 1224 .minScalar(0, S16) 1225 .widenScalarToNextPow2(0) 1226 .scalarize(0) 1227 .lower(); 1228 } else { 1229 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1230 .legalFor({S32, S16}) 1231 .widenScalarToNextPow2(0) 1232 .minScalar(0, S16) 1233 .scalarize(0) 1234 .lower(); 1235 } 1236 } else { 1237 // TODO: Should have same legality without v_perm_b32 1238 getActionDefinitionsBuilder(G_BSWAP) 1239 .legalFor({S32}) 1240 .lowerIf(scalarNarrowerThan(0, 32)) 1241 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1242 // narrowScalar limitation. 1243 .widenScalarToNextPow2(0) 1244 .maxScalar(0, S32) 1245 .scalarize(0) 1246 .lower(); 1247 1248 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1249 .legalFor({S32}) 1250 .minScalar(0, S32) 1251 .widenScalarToNextPow2(0) 1252 .scalarize(0) 1253 .lower(); 1254 } 1255 1256 getActionDefinitionsBuilder(G_INTTOPTR) 1257 // List the common cases 1258 .legalForCartesianProduct(AddrSpaces64, {S64}) 1259 .legalForCartesianProduct(AddrSpaces32, {S32}) 1260 .scalarize(0) 1261 // Accept any address space as long as the size matches 1262 .legalIf(sameSize(0, 1)) 1263 .widenScalarIf(smallerThan(1, 0), 1264 [](const LegalityQuery &Query) { 1265 return std::pair( 1266 1, LLT::scalar(Query.Types[0].getSizeInBits())); 1267 }) 1268 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { 1269 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1270 }); 1271 1272 getActionDefinitionsBuilder(G_PTRTOINT) 1273 // List the common cases 1274 .legalForCartesianProduct(AddrSpaces64, {S64}) 1275 .legalForCartesianProduct(AddrSpaces32, {S32}) 1276 .scalarize(0) 1277 // Accept any address space as long as the size matches 1278 .legalIf(sameSize(0, 1)) 1279 .widenScalarIf(smallerThan(0, 1), 1280 [](const LegalityQuery &Query) { 1281 return std::pair( 1282 0, LLT::scalar(Query.Types[1].getSizeInBits())); 1283 }) 1284 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { 1285 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1286 }); 1287 1288 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1289 .scalarize(0) 1290 .custom(); 1291 1292 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1293 bool IsLoad) -> bool { 1294 const LLT DstTy = Query.Types[0]; 1295 1296 // Split vector extloads. 
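    // Example (editorial): a G_LOAD producing <2 x s32> from a 32-bit
    // <2 x s16> memory access is a vector extload and is reported here as
    // needing a split.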
1297 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1298 1299 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1300 return true; 1301 1302 const LLT PtrTy = Query.Types[1]; 1303 unsigned AS = PtrTy.getAddressSpace(); 1304 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 1305 Query.MMODescrs[0].Ordering != 1306 AtomicOrdering::NotAtomic)) 1307 return true; 1308 1309 // Catch weird sized loads that don't evenly divide into the access sizes 1310 // TODO: May be able to widen depending on alignment etc. 1311 unsigned NumRegs = (MemSize + 31) / 32; 1312 if (NumRegs == 3) { 1313 if (!ST.hasDwordx3LoadStores()) 1314 return true; 1315 } else { 1316 // If the alignment allows, these should have been widened. 1317 if (!isPowerOf2_32(NumRegs)) 1318 return true; 1319 } 1320 1321 return false; 1322 }; 1323 1324 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1325 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1326 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1327 1328 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1329 // LDS 1330 // TODO: Unsupported flat for SI. 1331 1332 for (unsigned Op : {G_LOAD, G_STORE}) { 1333 const bool IsStore = Op == G_STORE; 1334 1335 auto &Actions = getActionDefinitionsBuilder(Op); 1336 // Explicitly list some common cases. 1337 // TODO: Does this help compile time at all? 1338 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1339 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1340 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1341 {S64, GlobalPtr, S64, GlobalAlign32}, 1342 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1343 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1344 {S32, GlobalPtr, S8, GlobalAlign8}, 1345 {S32, GlobalPtr, S16, GlobalAlign16}, 1346 1347 {S32, LocalPtr, S32, 32}, 1348 {S64, LocalPtr, S64, 32}, 1349 {V2S32, LocalPtr, V2S32, 32}, 1350 {S32, LocalPtr, S8, 8}, 1351 {S32, LocalPtr, S16, 16}, 1352 {V2S16, LocalPtr, S32, 32}, 1353 1354 {S32, PrivatePtr, S32, 32}, 1355 {S32, PrivatePtr, S8, 8}, 1356 {S32, PrivatePtr, S16, 16}, 1357 {V2S16, PrivatePtr, S32, 32}, 1358 1359 {S32, ConstantPtr, S32, GlobalAlign32}, 1360 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1361 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1362 {S64, ConstantPtr, S64, GlobalAlign32}, 1363 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1364 Actions.legalIf( 1365 [=](const LegalityQuery &Query) -> bool { 1366 return isLoadStoreLegal(ST, Query); 1367 }); 1368 1369 // The custom pointers (fat pointers, buffer resources) don't work with load 1370 // and store at this level. Fat pointers should have been lowered to 1371 // intrinsics before the translation to MIR. 1372 Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr})); 1373 1374 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and 1375 // ptrtoint. This is needed to account for the fact that we can't have i128 1376 // as a register class for SelectionDAG reasons. 1377 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1378 return hasBufferRsrcWorkaround(Query.Types[0]); 1379 }); 1380 1381 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1382 // 64-bits. 1383 // 1384 // TODO: Should generalize bitcast action into coerce, which will also cover 1385 // inserting addrspacecasts. 1386 Actions.customIf(typeIs(1, Constant32Ptr)); 1387 1388 // Turn any illegal element vectors into something easier to deal 1389 // with. 
These will ultimately produce 32-bit scalar shifts to extract the 1390 // parts anyway. 1391 // 1392 // For odd 16-bit element vectors, prefer to split those into pieces with 1393 // 16-bit vector parts. 1394 Actions.bitcastIf( 1395 [=](const LegalityQuery &Query) -> bool { 1396 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1397 Query.MMODescrs[0].MemoryTy); 1398 }, bitcastToRegisterType(0)); 1399 1400 if (!IsStore) { 1401 // Widen suitably aligned loads by loading extra bytes. The standard 1402 // legalization actions can't properly express widening memory operands. 1403 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1404 return shouldWidenLoad(ST, Query, G_LOAD); 1405 }); 1406 } 1407 1408 // FIXME: load/store narrowing should be moved to lower action 1409 Actions 1410 .narrowScalarIf( 1411 [=](const LegalityQuery &Query) -> bool { 1412 return !Query.Types[0].isVector() && 1413 needToSplitMemOp(Query, Op == G_LOAD); 1414 }, 1415 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1416 const LLT DstTy = Query.Types[0]; 1417 const LLT PtrTy = Query.Types[1]; 1418 1419 const unsigned DstSize = DstTy.getSizeInBits(); 1420 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1421 1422 // Split extloads. 1423 if (DstSize > MemSize) 1424 return std::pair(0, LLT::scalar(MemSize)); 1425 1426 unsigned MaxSize = maxSizeForAddrSpace( 1427 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1428 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1429 if (MemSize > MaxSize) 1430 return std::pair(0, LLT::scalar(MaxSize)); 1431 1432 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1433 return std::pair(0, LLT::scalar(Align)); 1434 }) 1435 .fewerElementsIf( 1436 [=](const LegalityQuery &Query) -> bool { 1437 return Query.Types[0].isVector() && 1438 needToSplitMemOp(Query, Op == G_LOAD); 1439 }, 1440 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1441 const LLT DstTy = Query.Types[0]; 1442 const LLT PtrTy = Query.Types[1]; 1443 1444 LLT EltTy = DstTy.getElementType(); 1445 unsigned MaxSize = maxSizeForAddrSpace( 1446 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1447 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1448 1449 // FIXME: Handle widened to power of 2 results better. This ends 1450 // up scalarizing. 1451 // FIXME: 3 element stores scalarized on SI 1452 1453 // Split if it's too large for the address space. 1454 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1455 if (MemSize > MaxSize) { 1456 unsigned NumElts = DstTy.getNumElements(); 1457 unsigned EltSize = EltTy.getSizeInBits(); 1458 1459 if (MaxSize % EltSize == 0) { 1460 return std::pair( 1461 0, LLT::scalarOrVector( 1462 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1463 } 1464 1465 unsigned NumPieces = MemSize / MaxSize; 1466 1467 // FIXME: Refine when odd breakdowns handled 1468 // The scalars will need to be re-legalized. 1469 if (NumPieces == 1 || NumPieces >= NumElts || 1470 NumElts % NumPieces != 0) 1471 return std::pair(0, EltTy); 1472 1473 return std::pair(0, 1474 LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1475 } 1476 1477 // FIXME: We could probably handle weird extending loads better. 1478 if (DstTy.getSizeInBits() > MemSize) 1479 return std::pair(0, EltTy); 1480 1481 unsigned EltSize = EltTy.getSizeInBits(); 1482 unsigned DstSize = DstTy.getSizeInBits(); 1483 if (!isPowerOf2_32(DstSize)) { 1484 // We're probably decomposing an odd sized store. Try to split 1485 // to the widest type. TODO: Account for alignment. 
As-is it 1486 // should be OK, since the new parts will be further legalized. 1487 unsigned FloorSize = llvm::bit_floor(DstSize); 1488 return std::pair( 1489 0, LLT::scalarOrVector( 1490 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1491 } 1492 1493 // May need relegalization for the scalars. 1494 return std::pair(0, EltTy); 1495 }) 1496 .minScalar(0, S32) 1497 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1498 .widenScalarToNextPow2(0) 1499 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1500 .lower(); 1501 } 1502 1503 // FIXME: Unaligned accesses not lowered. 1504 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1505 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1506 {S32, GlobalPtr, S16, 2 * 8}, 1507 {S32, LocalPtr, S8, 8}, 1508 {S32, LocalPtr, S16, 16}, 1509 {S32, PrivatePtr, S8, 8}, 1510 {S32, PrivatePtr, S16, 16}, 1511 {S32, ConstantPtr, S8, 8}, 1512 {S32, ConstantPtr, S16, 2 * 8}}) 1513 .legalIf( 1514 [=](const LegalityQuery &Query) -> bool { 1515 return isLoadStoreLegal(ST, Query); 1516 }); 1517 1518 if (ST.hasFlatAddressSpace()) { 1519 ExtLoads.legalForTypesWithMemDesc( 1520 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1521 } 1522 1523 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1524 // 64-bits. 1525 // 1526 // TODO: Should generalize bitcast action into coerce, which will also cover 1527 // inserting addrspacecasts. 1528 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1529 1530 ExtLoads.clampScalar(0, S32, S32) 1531 .widenScalarToNextPow2(0) 1532 .lower(); 1533 1534 auto &Atomics = getActionDefinitionsBuilder( 1535 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1536 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1537 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1538 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP}) 1539 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1540 {S64, GlobalPtr}, {S64, LocalPtr}, 1541 {S32, RegionPtr}, {S64, RegionPtr}}); 1542 if (ST.hasFlatAddressSpace()) { 1543 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1544 } 1545 1546 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1547 if (ST.hasLDSFPAtomicAdd()) { 1548 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1549 if (ST.hasGFX90AInsts()) 1550 Atomic.legalFor({{S64, LocalPtr}}); 1551 if (ST.hasAtomicDsPkAdd16Insts()) 1552 Atomic.legalFor({{V2S16, LocalPtr}}); 1553 } 1554 if (ST.hasAtomicFaddInsts()) 1555 Atomic.legalFor({{S32, GlobalPtr}}); 1556 if (ST.hasFlatAtomicFaddF32Inst()) 1557 Atomic.legalFor({{S32, FlatPtr}}); 1558 1559 if (ST.hasGFX90AInsts()) { 1560 // These are legal with some caveats, and should have undergone expansion in 1561 // the IR in most situations 1562 // TODO: Move atomic expansion into legalizer 1563 Atomic.legalFor({ 1564 {S32, GlobalPtr}, 1565 {S64, GlobalPtr}, 1566 {S64, FlatPtr} 1567 }); 1568 } 1569 1570 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1571 // demarshalling 1572 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1573 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1574 {S32, FlatPtr}, {S64, FlatPtr}}) 1575 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1576 {S32, RegionPtr}, {S64, RegionPtr}}); 1577 // TODO: Pointer types, any 32-bit or 64-bit vector 1578 1579 // Condition should be s32 for scalar, s1 for vector. 
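  // Illustrative (editorial): a divergent select ends up as a VALU cndmask
  // consuming an s1 condition in VCC, while a uniform select goes through SCC,
  // which is only addressable as a 32-bit register, hence the s32 form.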
1580 getActionDefinitionsBuilder(G_SELECT) 1581 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1582 LocalPtr, FlatPtr, PrivatePtr, 1583 LLT::fixed_vector(2, LocalPtr), 1584 LLT::fixed_vector(2, PrivatePtr)}, 1585 {S1, S32}) 1586 .clampScalar(0, S16, S64) 1587 .scalarize(1) 1588 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1589 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1590 .clampMaxNumElements(0, S32, 2) 1591 .clampMaxNumElements(0, LocalPtr, 2) 1592 .clampMaxNumElements(0, PrivatePtr, 2) 1593 .scalarize(0) 1594 .widenScalarToNextPow2(0) 1595 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1596 1597 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1598 // be more flexible with the shift amount type. 1599 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1600 .legalFor({{S32, S32}, {S64, S32}}); 1601 if (ST.has16BitInsts()) { 1602 if (ST.hasVOP3PInsts()) { 1603 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1604 .clampMaxNumElements(0, S16, 2); 1605 } else 1606 Shifts.legalFor({{S16, S16}}); 1607 1608 // TODO: Support 16-bit shift amounts for all types 1609 Shifts.widenScalarIf( 1610 [=](const LegalityQuery &Query) { 1611 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1612 // 32-bit amount. 1613 const LLT ValTy = Query.Types[0]; 1614 const LLT AmountTy = Query.Types[1]; 1615 return ValTy.getSizeInBits() <= 16 && 1616 AmountTy.getSizeInBits() < 16; 1617 }, changeTo(1, S16)); 1618 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1619 Shifts.clampScalar(1, S32, S32); 1620 Shifts.widenScalarToNextPow2(0, 16); 1621 Shifts.clampScalar(0, S16, S64); 1622 1623 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1624 .minScalar(0, S16) 1625 .scalarize(0) 1626 .lower(); 1627 } else { 1628 // Make sure we legalize the shift amount type first, as the general 1629 // expansion for the shifted type will produce much worse code if it hasn't 1630 // been truncated already. 1631 Shifts.clampScalar(1, S32, S32); 1632 Shifts.widenScalarToNextPow2(0, 32); 1633 Shifts.clampScalar(0, S32, S64); 1634 1635 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1636 .minScalar(0, S32) 1637 .scalarize(0) 1638 .lower(); 1639 } 1640 Shifts.scalarize(0); 1641 1642 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1643 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1644 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1645 unsigned IdxTypeIdx = 2; 1646 1647 getActionDefinitionsBuilder(Op) 1648 .customIf([=](const LegalityQuery &Query) { 1649 const LLT EltTy = Query.Types[EltTypeIdx]; 1650 const LLT VecTy = Query.Types[VecTypeIdx]; 1651 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1652 const unsigned EltSize = EltTy.getSizeInBits(); 1653 const bool isLegalVecType = 1654 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); 1655 // Address space 8 pointers are 128-bit wide values, but the logic 1656 // below will try to bitcast them to 2N x s64, which will fail. 1657 // Therefore, as an intermediate step, wrap extracts/insertions from a 1658 // ptrtoint-ing the vector and scalar arguments (or inttoptring the 1659 // extraction result) in order to produce a vector operation that can 1660 // be handled by the logic below. 
1661 if (EltTy.isPointer() && EltSize > 64) 1662 return true; 1663 return (EltSize == 32 || EltSize == 64) && 1664 VecTy.getSizeInBits() % 32 == 0 && 1665 VecTy.getSizeInBits() <= MaxRegisterSize && 1666 IdxTy.getSizeInBits() == 32 && 1667 isLegalVecType; 1668 }) 1669 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1670 bitcastToVectorElement32(VecTypeIdx)) 1671 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1672 .bitcastIf( 1673 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1674 [=](const LegalityQuery &Query) { 1675 // For > 64-bit element types, try to turn this into a 64-bit 1676 // element vector since we may be able to do better indexing 1677 // if this is scalar. If not, fall back to 32. 1678 const LLT EltTy = Query.Types[EltTypeIdx]; 1679 const LLT VecTy = Query.Types[VecTypeIdx]; 1680 const unsigned DstEltSize = EltTy.getSizeInBits(); 1681 const unsigned VecSize = VecTy.getSizeInBits(); 1682 1683 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1684 return std::pair( 1685 VecTypeIdx, 1686 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1687 }) 1688 .clampScalar(EltTypeIdx, S32, S64) 1689 .clampScalar(VecTypeIdx, S32, S64) 1690 .clampScalar(IdxTypeIdx, S32, S32) 1691 .clampMaxNumElements(VecTypeIdx, S32, 32) 1692 // TODO: Clamp elements for 64-bit vectors? 1693 .moreElementsIf( 1694 isIllegalRegisterType(VecTypeIdx), 1695 moreElementsToNextExistingRegClass(VecTypeIdx)) 1696 // It should only be necessary with variable indexes. 1697 // As a last resort, lower to the stack 1698 .lower(); 1699 } 1700 1701 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1702 .unsupportedIf([=](const LegalityQuery &Query) { 1703 const LLT &EltTy = Query.Types[1].getElementType(); 1704 return Query.Types[0] != EltTy; 1705 }); 1706 1707 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1708 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1709 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1710 1711 // FIXME: Doesn't handle extract of illegal sizes. 1712 getActionDefinitionsBuilder(Op) 1713 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1714 .lowerIf([=](const LegalityQuery &Query) { 1715 // Sub-vector(or single element) insert and extract. 1716 // TODO: verify immediate offset here since lower only works with 1717 // whole elements. 1718 const LLT BigTy = Query.Types[BigTyIdx]; 1719 return BigTy.isVector(); 1720 }) 1721 // FIXME: Multiples of 16 should not be legal. 
1722 .legalIf([=](const LegalityQuery &Query) { 1723 const LLT BigTy = Query.Types[BigTyIdx]; 1724 const LLT LitTy = Query.Types[LitTyIdx]; 1725 return (BigTy.getSizeInBits() % 32 == 0) && 1726 (LitTy.getSizeInBits() % 16 == 0); 1727 }) 1728 .widenScalarIf( 1729 [=](const LegalityQuery &Query) { 1730 const LLT BigTy = Query.Types[BigTyIdx]; 1731 return (BigTy.getScalarSizeInBits() < 16); 1732 }, 1733 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1734 .widenScalarIf( 1735 [=](const LegalityQuery &Query) { 1736 const LLT LitTy = Query.Types[LitTyIdx]; 1737 return (LitTy.getScalarSizeInBits() < 16); 1738 }, 1739 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1740 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1741 .widenScalarToNextPow2(BigTyIdx, 32); 1742 1743 } 1744 1745 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1746 .legalForCartesianProduct(AllS32Vectors, {S32}) 1747 .legalForCartesianProduct(AllS64Vectors, {S64}) 1748 .clampNumElements(0, V16S32, V32S32) 1749 .clampNumElements(0, V2S64, V16S64) 1750 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) 1751 .moreElementsIf( 1752 isIllegalRegisterType(0), 1753 moreElementsToNextExistingRegClass(0)); 1754 1755 if (ST.hasScalarPackInsts()) { 1756 BuildVector 1757 // FIXME: Should probably widen s1 vectors straight to s32 1758 .minScalarOrElt(0, S16) 1759 .minScalar(1, S16); 1760 1761 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1762 .legalFor({V2S16, S32}) 1763 .lower(); 1764 } else { 1765 BuildVector.customFor({V2S16, S16}); 1766 BuildVector.minScalarOrElt(0, S32); 1767 1768 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1769 .customFor({V2S16, S32}) 1770 .lower(); 1771 } 1772 1773 BuildVector.legalIf(isRegisterType(0)); 1774 1775 // FIXME: Clamp maximum size 1776 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1777 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1778 .clampMaxNumElements(0, S32, 32) 1779 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1780 .clampMaxNumElements(0, S16, 64); 1781 1782 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1783 1784 // Merge/Unmerge 1785 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1786 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1787 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1788 1789 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1790 const LLT Ty = Query.Types[TypeIdx]; 1791 if (Ty.isVector()) { 1792 const LLT &EltTy = Ty.getElementType(); 1793 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1794 return true; 1795 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) 1796 return true; 1797 } 1798 return false; 1799 }; 1800 1801 auto &Builder = getActionDefinitionsBuilder(Op) 1802 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1803 .lowerFor({{S16, V2S16}}) 1804 .lowerIf([=](const LegalityQuery &Query) { 1805 const LLT BigTy = Query.Types[BigTyIdx]; 1806 return BigTy.getSizeInBits() == 32; 1807 }) 1808 // Try to widen to s16 first for small types. 1809 // TODO: Only do this on targets with legal s16 shifts 1810 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1811 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1812 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1813 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1814 elementTypeIs(1, S16)), 1815 changeTo(1, V2S16)) 1816 // Clamp the little scalar to s8-s256 and make it a power of 2. 
It's not
1817 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1818 // valid.
1819 .clampScalar(LitTyIdx, S32, S512)
1820 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1821 // Break up vectors with weird elements into scalars
1822 .fewerElementsIf(
1823 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1824 scalarize(0))
1825 .fewerElementsIf(
1826 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1827 scalarize(1))
1828 .clampScalar(BigTyIdx, S32, MaxScalar);
1829
1830 if (Op == G_MERGE_VALUES) {
1831 Builder.widenScalarIf(
1832 // TODO: Use 16-bit shifts if legal for 8-bit values?
1833 [=](const LegalityQuery &Query) {
1834 const LLT Ty = Query.Types[LitTyIdx];
1835 return Ty.getSizeInBits() < 32;
1836 },
1837 changeTo(LitTyIdx, S32));
1838 }
1839
1840 Builder.widenScalarIf(
1841 [=](const LegalityQuery &Query) {
1842 const LLT Ty = Query.Types[BigTyIdx];
1843 return Ty.getSizeInBits() % 16 != 0;
1844 },
1845 [=](const LegalityQuery &Query) {
1846 // Pick the next power of 2, or a multiple of 64 over 128,
1847 // whichever is smaller.
1848 const LLT &Ty = Query.Types[BigTyIdx];
1849 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1850 if (NewSizeInBits >= 256) {
1851 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1852 if (RoundedTo < NewSizeInBits)
1853 NewSizeInBits = RoundedTo;
1854 }
1855 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1856 })
1857 // Any vectors left are the wrong size. Scalarize them.
1858 .scalarize(0)
1859 .scalarize(1);
1860 }
1861
1862 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1863 // RegBankSelect.
1864 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1865 .legalFor({{S32}, {S64}});
1866
1867 if (ST.hasVOP3PInsts()) {
1868 SextInReg.lowerFor({{V2S16}})
1869 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1870 // get more vector shift opportunities, since we'll get those when
1871 // expanded.
1872 .clampMaxNumElementsStrict(0, S16, 2);
1873 } else if (ST.has16BitInsts()) {
1874 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1875 } else {
1876 // Prefer to promote to s32 before lowering if we don't have 16-bit
1877 // shifts. This avoids a lot of intermediate truncate and extend operations.
1878 SextInReg.lowerFor({{S32}, {S64}});
1879 }
1880
1881 SextInReg
1882 .scalarize(0)
1883 .clampScalar(0, S32, S64)
1884 .lower();
1885
1886 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1887 .scalarize(0)
1888 .lower();
1889
1890 // TODO: Only try to form v2s16 with legal packed instructions.
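// Descriptive note on the funnel-shift rules below: G_FSHR stays legal only
// for 32-bit scalars; packed v2s16 cases are clamped and lowered, and all
// remaining types are scalarized and then expanded. G_FSHL has no direct
// support and is always lowered, with the v2s16 form handled explicitly when
// VOP3P instructions are available.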
1891 getActionDefinitionsBuilder(G_FSHR) 1892 .legalFor({{S32, S32}}) 1893 .lowerFor({{V2S16, V2S16}}) 1894 .clampMaxNumElementsStrict(0, S16, 2) 1895 .scalarize(0) 1896 .lower(); 1897 1898 if (ST.hasVOP3PInsts()) { 1899 getActionDefinitionsBuilder(G_FSHL) 1900 .lowerFor({{V2S16, V2S16}}) 1901 .clampMaxNumElementsStrict(0, S16, 2) 1902 .scalarize(0) 1903 .lower(); 1904 } else { 1905 getActionDefinitionsBuilder(G_FSHL) 1906 .scalarize(0) 1907 .lower(); 1908 } 1909 1910 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1911 .legalFor({S64}); 1912 1913 getActionDefinitionsBuilder(G_FENCE) 1914 .alwaysLegal(); 1915 1916 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1917 .scalarize(0) 1918 .minScalar(0, S32) 1919 .lower(); 1920 1921 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1922 .legalFor({{S32, S32}, {S64, S32}}) 1923 .clampScalar(1, S32, S32) 1924 .clampScalar(0, S32, S64) 1925 .widenScalarToNextPow2(0) 1926 .scalarize(0); 1927 1928 getActionDefinitionsBuilder({ 1929 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1930 G_FCOPYSIGN, 1931 1932 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1933 G_ATOMICRMW_NAND, 1934 G_ATOMICRMW_FSUB, 1935 G_READ_REGISTER, 1936 G_WRITE_REGISTER, 1937 1938 G_SADDO, G_SSUBO, 1939 1940 // TODO: Implement 1941 G_FMINIMUM, G_FMAXIMUM}).lower(); 1942 1943 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1944 .lower(); 1945 1946 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1947 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1948 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1949 .unsupported(); 1950 1951 getLegacyLegalizerInfo().computeTables(); 1952 verify(*ST.getInstrInfo()); 1953 } 1954 1955 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1956 MachineInstr &MI) const { 1957 MachineIRBuilder &B = Helper.MIRBuilder; 1958 MachineRegisterInfo &MRI = *B.getMRI(); 1959 1960 switch (MI.getOpcode()) { 1961 case TargetOpcode::G_ADDRSPACE_CAST: 1962 return legalizeAddrSpaceCast(MI, MRI, B); 1963 case TargetOpcode::G_FRINT: 1964 return legalizeFrint(MI, MRI, B); 1965 case TargetOpcode::G_FCEIL: 1966 return legalizeFceil(MI, MRI, B); 1967 case TargetOpcode::G_FREM: 1968 return legalizeFrem(MI, MRI, B); 1969 case TargetOpcode::G_INTRINSIC_TRUNC: 1970 return legalizeIntrinsicTrunc(MI, MRI, B); 1971 case TargetOpcode::G_SITOFP: 1972 return legalizeITOFP(MI, MRI, B, true); 1973 case TargetOpcode::G_UITOFP: 1974 return legalizeITOFP(MI, MRI, B, false); 1975 case TargetOpcode::G_FPTOSI: 1976 return legalizeFPTOI(MI, MRI, B, true); 1977 case TargetOpcode::G_FPTOUI: 1978 return legalizeFPTOI(MI, MRI, B, false); 1979 case TargetOpcode::G_FMINNUM: 1980 case TargetOpcode::G_FMAXNUM: 1981 case TargetOpcode::G_FMINNUM_IEEE: 1982 case TargetOpcode::G_FMAXNUM_IEEE: 1983 return legalizeMinNumMaxNum(Helper, MI); 1984 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1985 return legalizeExtractVectorElt(MI, MRI, B); 1986 case TargetOpcode::G_INSERT_VECTOR_ELT: 1987 return legalizeInsertVectorElt(MI, MRI, B); 1988 case TargetOpcode::G_FSIN: 1989 case TargetOpcode::G_FCOS: 1990 return legalizeSinCos(MI, MRI, B); 1991 case TargetOpcode::G_GLOBAL_VALUE: 1992 return legalizeGlobalValue(MI, MRI, B); 1993 case TargetOpcode::G_LOAD: 1994 case TargetOpcode::G_SEXTLOAD: 1995 case TargetOpcode::G_ZEXTLOAD: 1996 return legalizeLoad(Helper, MI); 1997 case TargetOpcode::G_STORE: 1998 return legalizeStore(Helper, MI); 1999 case TargetOpcode::G_FMAD: 2000 return legalizeFMad(MI, MRI, B); 2001 case TargetOpcode::G_FDIV: 2002 return legalizeFDIV(MI, MRI, B); 2003 
case TargetOpcode::G_FFREXP: 2004 return legalizeFFREXP(MI, MRI, B); 2005 case TargetOpcode::G_FSQRT: 2006 return legalizeFSQRT(MI, MRI, B); 2007 case TargetOpcode::G_UDIV: 2008 case TargetOpcode::G_UREM: 2009 case TargetOpcode::G_UDIVREM: 2010 return legalizeUnsignedDIV_REM(MI, MRI, B); 2011 case TargetOpcode::G_SDIV: 2012 case TargetOpcode::G_SREM: 2013 case TargetOpcode::G_SDIVREM: 2014 return legalizeSignedDIV_REM(MI, MRI, B); 2015 case TargetOpcode::G_ATOMIC_CMPXCHG: 2016 return legalizeAtomicCmpXChg(MI, MRI, B); 2017 case TargetOpcode::G_FLOG2: 2018 return legalizeFlog2(MI, B); 2019 case TargetOpcode::G_FLOG: 2020 case TargetOpcode::G_FLOG10: 2021 return legalizeFlogCommon(MI, B); 2022 case TargetOpcode::G_FEXP2: 2023 return legalizeFExp2(MI, B); 2024 case TargetOpcode::G_FEXP: 2025 return legalizeFExp(MI, B); 2026 case TargetOpcode::G_FPOW: 2027 return legalizeFPow(MI, B); 2028 case TargetOpcode::G_FFLOOR: 2029 return legalizeFFloor(MI, MRI, B); 2030 case TargetOpcode::G_BUILD_VECTOR: 2031 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2032 return legalizeBuildVector(MI, MRI, B); 2033 case TargetOpcode::G_MUL: 2034 return legalizeMul(Helper, MI); 2035 case TargetOpcode::G_CTLZ: 2036 case TargetOpcode::G_CTTZ: 2037 return legalizeCTLZ_CTTZ(MI, MRI, B); 2038 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 2039 return legalizeFPTruncRound(MI, B); 2040 default: 2041 return false; 2042 } 2043 2044 llvm_unreachable("expected switch to return"); 2045 } 2046 2047 Register AMDGPULegalizerInfo::getSegmentAperture( 2048 unsigned AS, 2049 MachineRegisterInfo &MRI, 2050 MachineIRBuilder &B) const { 2051 MachineFunction &MF = B.getMF(); 2052 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 2053 const LLT S32 = LLT::scalar(32); 2054 const LLT S64 = LLT::scalar(64); 2055 2056 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 2057 2058 if (ST.hasApertureRegs()) { 2059 // Note: this register is somewhat broken. When used as a 32-bit operand, 2060 // it only returns zeroes. The real value is in the upper 32 bits. 2061 // Thus, we must emit extract the high 32 bits. 2062 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) 2063 ? AMDGPU::SRC_SHARED_BASE 2064 : AMDGPU::SRC_PRIVATE_BASE; 2065 // FIXME: It would be more natural to emit a COPY here, but then copy 2066 // coalescing would kick in and it would think it's okay to use the "HI" 2067 // subregister (instead of extracting the HI 32 bits) which is an artificial 2068 // (unusable) register. 2069 // Register TableGen definitions would need an overhaul to get rid of the 2070 // artificial "HI" aperture registers and prevent this kind of issue from 2071 // happening. 2072 Register Dst = MRI.createGenericVirtualRegister(S64); 2073 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass); 2074 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)}); 2075 return B.buildUnmerge(S32, Dst).getReg(1); 2076 } 2077 2078 // TODO: can we be smarter about machine pointer info? 2079 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 2080 Register LoadAddr = MRI.createGenericVirtualRegister( 2081 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2082 // For code object version 5, private_base and shared_base are passed through 2083 // implicit kernargs. 2084 if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= 2085 AMDGPU::AMDHSA_COV5) { 2086 AMDGPUTargetLowering::ImplicitParameter Param = 2087 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 2088 : AMDGPUTargetLowering::PRIVATE_BASE; 2089 uint64_t Offset = 2090 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 2091 2092 Register KernargPtrReg = MRI.createGenericVirtualRegister( 2093 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2094 2095 if (!loadInputValue(KernargPtrReg, B, 2096 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 2097 return Register(); 2098 2099 MachineMemOperand *MMO = MF.getMachineMemOperand( 2100 PtrInfo, 2101 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2102 MachineMemOperand::MOInvariant, 2103 LLT::scalar(32), commonAlignment(Align(64), Offset)); 2104 2105 // Pointer address 2106 B.buildPtrAdd(LoadAddr, KernargPtrReg, 2107 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 2108 // Load address 2109 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2110 } 2111 2112 Register QueuePtr = MRI.createGenericVirtualRegister( 2113 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2114 2115 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 2116 return Register(); 2117 2118 // Offset into amd_queue_t for group_segment_aperture_base_hi / 2119 // private_segment_aperture_base_hi. 2120 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 2121 2122 MachineMemOperand *MMO = MF.getMachineMemOperand( 2123 PtrInfo, 2124 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2125 MachineMemOperand::MOInvariant, 2126 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 2127 2128 B.buildPtrAdd(LoadAddr, QueuePtr, 2129 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 2130 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2131 } 2132 2133 /// Return true if the value is a known valid address, such that a null check is 2134 /// not necessary. 2135 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 2136 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 2137 MachineInstr *Def = MRI.getVRegDef(Val); 2138 switch (Def->getOpcode()) { 2139 case AMDGPU::G_FRAME_INDEX: 2140 case AMDGPU::G_GLOBAL_VALUE: 2141 case AMDGPU::G_BLOCK_ADDR: 2142 return true; 2143 case AMDGPU::G_CONSTANT: { 2144 const ConstantInt *CI = Def->getOperand(1).getCImm(); 2145 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 2146 } 2147 default: 2148 return false; 2149 } 2150 2151 return false; 2152 } 2153 2154 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 2155 MachineInstr &MI, MachineRegisterInfo &MRI, 2156 MachineIRBuilder &B) const { 2157 MachineFunction &MF = B.getMF(); 2158 2159 const LLT S32 = LLT::scalar(32); 2160 Register Dst = MI.getOperand(0).getReg(); 2161 Register Src = MI.getOperand(1).getReg(); 2162 2163 LLT DstTy = MRI.getType(Dst); 2164 LLT SrcTy = MRI.getType(Src); 2165 unsigned DestAS = DstTy.getAddressSpace(); 2166 unsigned SrcAS = SrcTy.getAddressSpace(); 2167 2168 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 2169 // vector element. 2170 assert(!DstTy.isVector()); 2171 2172 const AMDGPUTargetMachine &TM 2173 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 2174 2175 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 2176 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 2177 return true; 2178 } 2179 2180 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 2181 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 2182 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2183 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 2184 // Extract low 32-bits of the pointer. 
2185 B.buildExtract(Dst, Src, 0); 2186 MI.eraseFromParent(); 2187 return true; 2188 } 2189 2190 unsigned NullVal = TM.getNullPointerValue(DestAS); 2191 2192 auto SegmentNull = B.buildConstant(DstTy, NullVal); 2193 auto FlatNull = B.buildConstant(SrcTy, 0); 2194 2195 // Extract low 32-bits of the pointer. 2196 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 2197 2198 auto CmpRes = 2199 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 2200 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 2201 2202 MI.eraseFromParent(); 2203 return true; 2204 } 2205 2206 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 2207 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 2208 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2209 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 2210 if (!ApertureReg.isValid()) 2211 return false; 2212 2213 // Coerce the type of the low half of the result so we can use merge_values. 2214 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 2215 2216 // TODO: Should we allow mismatched types but matching sizes in merges to 2217 // avoid the ptrtoint? 2218 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); 2219 2220 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 2221 B.buildCopy(Dst, BuildPtr); 2222 MI.eraseFromParent(); 2223 return true; 2224 } 2225 2226 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 2227 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 2228 2229 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 2230 SegmentNull.getReg(0)); 2231 2232 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2233 2234 MI.eraseFromParent(); 2235 return true; 2236 } 2237 2238 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2239 SrcTy.getSizeInBits() == 64) { 2240 // Truncate. 2241 B.buildExtract(Dst, Src, 0); 2242 MI.eraseFromParent(); 2243 return true; 2244 } 2245 2246 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2247 DstTy.getSizeInBits() == 64) { 2248 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2249 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2250 auto PtrLo = B.buildPtrToInt(S32, Src); 2251 auto HighAddr = B.buildConstant(S32, AddrHiVal); 2252 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 2253 MI.eraseFromParent(); 2254 return true; 2255 } 2256 2257 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2258 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2259 2260 LLVMContext &Ctx = MF.getFunction().getContext(); 2261 Ctx.diagnose(InvalidAddrSpaceCast); 2262 B.buildUndef(Dst); 2263 MI.eraseFromParent(); 2264 return true; 2265 } 2266 2267 bool AMDGPULegalizerInfo::legalizeFrint( 2268 MachineInstr &MI, MachineRegisterInfo &MRI, 2269 MachineIRBuilder &B) const { 2270 Register Src = MI.getOperand(1).getReg(); 2271 LLT Ty = MRI.getType(Src); 2272 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2273 2274 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2275 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2276 2277 auto C1 = B.buildFConstant(Ty, C1Val); 2278 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2279 2280 // TODO: Should this propagate fast-math-flags? 
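// Descriptive note: the sequence below is the usual round-to-nearest trick.
// Adding and then subtracting copysign(0x1.0p+52, src) forces the f64
// mantissa to round to an integral value, and inputs whose magnitude exceeds
// 0x1.fffffffffffffp+51 are already integral, so the final select returns
// them unchanged.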
2281 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2282 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2283 2284 auto C2 = B.buildFConstant(Ty, C2Val); 2285 auto Fabs = B.buildFAbs(Ty, Src); 2286 2287 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2288 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2289 MI.eraseFromParent(); 2290 return true; 2291 } 2292 2293 bool AMDGPULegalizerInfo::legalizeFceil( 2294 MachineInstr &MI, MachineRegisterInfo &MRI, 2295 MachineIRBuilder &B) const { 2296 2297 const LLT S1 = LLT::scalar(1); 2298 const LLT S64 = LLT::scalar(64); 2299 2300 Register Src = MI.getOperand(1).getReg(); 2301 assert(MRI.getType(Src) == S64); 2302 2303 // result = trunc(src) 2304 // if (src > 0.0 && src != result) 2305 // result += 1.0 2306 2307 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2308 2309 const auto Zero = B.buildFConstant(S64, 0.0); 2310 const auto One = B.buildFConstant(S64, 1.0); 2311 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2312 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2313 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2314 auto Add = B.buildSelect(S64, And, One, Zero); 2315 2316 // TODO: Should this propagate fast-math-flags? 2317 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2318 MI.eraseFromParent(); 2319 return true; 2320 } 2321 2322 bool AMDGPULegalizerInfo::legalizeFrem( 2323 MachineInstr &MI, MachineRegisterInfo &MRI, 2324 MachineIRBuilder &B) const { 2325 Register DstReg = MI.getOperand(0).getReg(); 2326 Register Src0Reg = MI.getOperand(1).getReg(); 2327 Register Src1Reg = MI.getOperand(2).getReg(); 2328 auto Flags = MI.getFlags(); 2329 LLT Ty = MRI.getType(DstReg); 2330 2331 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2332 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2333 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2334 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2335 MI.eraseFromParent(); 2336 return true; 2337 } 2338 2339 static MachineInstrBuilder extractF64Exponent(Register Hi, 2340 MachineIRBuilder &B) { 2341 const unsigned FractBits = 52; 2342 const unsigned ExpBits = 11; 2343 LLT S32 = LLT::scalar(32); 2344 2345 auto Const0 = B.buildConstant(S32, FractBits - 32); 2346 auto Const1 = B.buildConstant(S32, ExpBits); 2347 2348 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2349 .addUse(Hi) 2350 .addUse(Const0.getReg(0)) 2351 .addUse(Const1.getReg(0)); 2352 2353 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2354 } 2355 2356 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2357 MachineInstr &MI, MachineRegisterInfo &MRI, 2358 MachineIRBuilder &B) const { 2359 const LLT S1 = LLT::scalar(1); 2360 const LLT S32 = LLT::scalar(32); 2361 const LLT S64 = LLT::scalar(64); 2362 2363 Register Src = MI.getOperand(1).getReg(); 2364 assert(MRI.getType(Src) == S64); 2365 2366 // TODO: Should this use extract since the low half is unused? 2367 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2368 Register Hi = Unmerge.getReg(1); 2369 2370 // Extract the upper half, since this is where we will find the sign and 2371 // exponent. 2372 auto Exp = extractF64Exponent(Hi, B); 2373 2374 const unsigned FractBits = 52; 2375 2376 // Extract the sign bit. 2377 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2378 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2379 2380 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2381 2382 const auto Zero32 = B.buildConstant(S32, 0); 2383 2384 // Extend back to 64-bits. 
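// Descriptive note on the selects built below: shifting the 52-bit fraction
// mask right by the unbiased exponent leaves ones exactly in the bit
// positions that are still fractional, so ANDing the source with the
// complement clears them. A negative exponent selects the signed zero formed
// from the sign bit, and an exponent above 51 means the value is already
// integral, so the original source is returned.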
2385 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 2386 2387 auto Shr = B.buildAShr(S64, FractMask, Exp); 2388 auto Not = B.buildNot(S64, Shr); 2389 auto Tmp0 = B.buildAnd(S64, Src, Not); 2390 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2391 2392 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2393 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2394 2395 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2396 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2397 MI.eraseFromParent(); 2398 return true; 2399 } 2400 2401 bool AMDGPULegalizerInfo::legalizeITOFP( 2402 MachineInstr &MI, MachineRegisterInfo &MRI, 2403 MachineIRBuilder &B, bool Signed) const { 2404 2405 Register Dst = MI.getOperand(0).getReg(); 2406 Register Src = MI.getOperand(1).getReg(); 2407 2408 const LLT S64 = LLT::scalar(64); 2409 const LLT S32 = LLT::scalar(32); 2410 2411 assert(MRI.getType(Src) == S64); 2412 2413 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2414 auto ThirtyTwo = B.buildConstant(S32, 32); 2415 2416 if (MRI.getType(Dst) == S64) { 2417 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2418 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2419 2420 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2421 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo); 2422 2423 // TODO: Should this propagate fast-math-flags? 2424 B.buildFAdd(Dst, LdExp, CvtLo); 2425 MI.eraseFromParent(); 2426 return true; 2427 } 2428 2429 assert(MRI.getType(Dst) == S32); 2430 2431 auto One = B.buildConstant(S32, 1); 2432 2433 MachineInstrBuilder ShAmt; 2434 if (Signed) { 2435 auto ThirtyOne = B.buildConstant(S32, 31); 2436 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2437 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2438 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2439 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2440 /*HasSideEffects=*/false) 2441 .addUse(Unmerge.getReg(1)); 2442 auto LS2 = B.buildSub(S32, LS, One); 2443 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2444 } else 2445 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2446 auto Norm = B.buildShl(S64, Src, ShAmt); 2447 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2448 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2449 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2450 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2451 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2452 B.buildFLdexp(Dst, FVal, Scale); 2453 MI.eraseFromParent(); 2454 return true; 2455 } 2456 2457 // TODO: Copied from DAG implementation. Verify logic and document how this 2458 // actually works. 2459 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2460 MachineRegisterInfo &MRI, 2461 MachineIRBuilder &B, 2462 bool Signed) const { 2463 2464 Register Dst = MI.getOperand(0).getReg(); 2465 Register Src = MI.getOperand(1).getReg(); 2466 2467 const LLT S64 = LLT::scalar(64); 2468 const LLT S32 = LLT::scalar(32); 2469 2470 const LLT SrcLT = MRI.getType(Src); 2471 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2472 2473 unsigned Flags = MI.getFlags(); 2474 2475 // The basic idea of converting a floating point number into a pair of 32-bit 2476 // integers is illustrated as follows: 2477 // 2478 // tf := trunc(val); 2479 // hif := floor(tf * 2^-32); 2480 // lof := tf - hif * 2^32; // lof is always positive due to floor. 
2481 // hi := fptoi(hif);
2482 // lo := fptoi(lof);
2483 //
2484 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2485 MachineInstrBuilder Sign;
2486 if (Signed && SrcLT == S32) {
2487 // However, a 32-bit floating point number has only a 23-bit mantissa,
2488 // which is not enough to hold all the significant bits of `lof` if val is
2489 // negative. To avoid the loss of precision, we need to take the absolute
2490 // value after truncating and flip the result back based on the original
2491 // signedness.
2492 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2493 Trunc = B.buildFAbs(S32, Trunc, Flags);
2494 }
2495 MachineInstrBuilder K0, K1;
2496 if (SrcLT == S64) {
2497 K0 = B.buildFConstant(
2498 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2499 K1 = B.buildFConstant(
2500 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2501 } else {
2502 K0 = B.buildFConstant(
2503 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2504 K1 = B.buildFConstant(
2505 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2506 }
2507
2508 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2509 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2510 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2511
2512 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2513 : B.buildFPTOUI(S32, FloorMul);
2514 auto Lo = B.buildFPTOUI(S32, Fma);
2515
2516 if (Signed && SrcLT == S32) {
2517 // Flip the result based on the signedness, which is either all 0s or 1s.
2518 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2519 // r := xor({lo, hi}, sign) - sign;
2520 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2521 Sign);
2522 } else
2523 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2524 MI.eraseFromParent();
2525
2526 return true;
2527 }
2528
2529 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2530 MachineInstr &MI) const {
2531 MachineFunction &MF = Helper.MIRBuilder.getMF();
2532 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2533
2534 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2535 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2536
2537 // With ieee_mode disabled, the instructions have the correct behavior
2538 // already for G_FMINNUM/G_FMAXNUM
2539 if (!MFI->getMode().IEEE)
2540 return !IsIEEEOp;
2541
2542 if (IsIEEEOp)
2543 return true;
2544
2545 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2546 }
2547
2548 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2549 MachineInstr &MI, MachineRegisterInfo &MRI,
2550 MachineIRBuilder &B) const {
2551 // TODO: Should move some of this into LegalizerHelper.
2552
2553 // TODO: Promote dynamic indexing of s16 to s32
2554
2555 Register Dst = MI.getOperand(0).getReg();
2556 Register Vec = MI.getOperand(1).getReg();
2557
2558 LLT VecTy = MRI.getType(Vec);
2559 LLT EltTy = VecTy.getElementType();
2560 assert(EltTy == MRI.getType(Dst));
2561
2562 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2563 // but we can't go directly to that logic because you can't bitcast a vector
2564 // of pointers to a vector of integers. Therefore, introduce an intermediate
2565 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2566 // drive the legalization forward.
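// Illustrative sketch (not generated verbatim; p8 buffer pointers are the
// 128-bit case mentioned earlier): extracting from <4 x p8> becomes roughly
//   %iv:_(<4 x s128>) = G_PTRTOINT %vec(<4 x p8>)
//   %ie:_(s128) = G_EXTRACT_VECTOR_ELT %iv, %idx
//   %dst:_(p8) = G_INTTOPTR %ie(s128)
// after which the generic wide-element handling can take over.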
2567 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2568 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2569 LLT IntVecTy = VecTy.changeElementType(IntTy);
2570
2571 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2572 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2573 B.buildIntToPtr(Dst, IntElt);
2574
2575 MI.eraseFromParent();
2576 return true;
2577 }
2578
2579 // FIXME: Artifact combiner probably should have replaced the truncated
2580 // constant before this, so we shouldn't need
2581 // getIConstantVRegValWithLookThrough.
2582 std::optional<ValueAndVReg> MaybeIdxVal =
2583 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2584 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2585 return true;
2586 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2587
2588 if (IdxVal < VecTy.getNumElements()) {
2589 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2590 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2591 } else {
2592 B.buildUndef(Dst);
2593 }
2594
2595 MI.eraseFromParent();
2596 return true;
2597 }
2598
2599 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2600 MachineInstr &MI, MachineRegisterInfo &MRI,
2601 MachineIRBuilder &B) const {
2602 // TODO: Should move some of this into LegalizerHelper.
2603
2604 // TODO: Promote dynamic indexing of s16 to s32
2605
2606 Register Dst = MI.getOperand(0).getReg();
2607 Register Vec = MI.getOperand(1).getReg();
2608 Register Ins = MI.getOperand(2).getReg();
2609
2610 LLT VecTy = MRI.getType(Vec);
2611 LLT EltTy = VecTy.getElementType();
2612 assert(EltTy == MRI.getType(Ins));
2613
2614 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2615 // but we can't go directly to that logic because you can't bitcast a vector
2616 // of pointers to a vector of integers. Therefore, make the pointer vector
2617 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2618 // new value, and then inttoptr the result vector back. This will then allow
2619 // the rest of legalization to take over.
2620 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2621 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2622 LLT IntVecTy = VecTy.changeElementType(IntTy);
2623
2624 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2625 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2626 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2627 MI.getOperand(3));
2628 B.buildIntToPtr(Dst, IntVecDest);
2629 MI.eraseFromParent();
2630 return true;
2631 }
2632
2633 // FIXME: Artifact combiner probably should have replaced the truncated
2634 // constant before this, so we shouldn't need
2635 // getIConstantVRegValWithLookThrough.
2636 std::optional<ValueAndVReg> MaybeIdxVal =
2637 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2638 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2639 return true; 2640 2641 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2642 2643 unsigned NumElts = VecTy.getNumElements(); 2644 if (IdxVal < NumElts) { 2645 SmallVector<Register, 8> SrcRegs; 2646 for (unsigned i = 0; i < NumElts; ++i) 2647 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2648 B.buildUnmerge(SrcRegs, Vec); 2649 2650 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2651 B.buildMergeLikeInstr(Dst, SrcRegs); 2652 } else { 2653 B.buildUndef(Dst); 2654 } 2655 2656 MI.eraseFromParent(); 2657 return true; 2658 } 2659 2660 bool AMDGPULegalizerInfo::legalizeSinCos( 2661 MachineInstr &MI, MachineRegisterInfo &MRI, 2662 MachineIRBuilder &B) const { 2663 2664 Register DstReg = MI.getOperand(0).getReg(); 2665 Register SrcReg = MI.getOperand(1).getReg(); 2666 LLT Ty = MRI.getType(DstReg); 2667 unsigned Flags = MI.getFlags(); 2668 2669 Register TrigVal; 2670 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2671 if (ST.hasTrigReducedRange()) { 2672 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2673 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2674 .addUse(MulVal.getReg(0)) 2675 .setMIFlags(Flags).getReg(0); 2676 } else 2677 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2678 2679 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2680 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2681 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false) 2682 .addUse(TrigVal) 2683 .setMIFlags(Flags); 2684 MI.eraseFromParent(); 2685 return true; 2686 } 2687 2688 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2689 MachineIRBuilder &B, 2690 const GlobalValue *GV, 2691 int64_t Offset, 2692 unsigned GAFlags) const { 2693 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2694 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2695 // to the following code sequence: 2696 // 2697 // For constant address space: 2698 // s_getpc_b64 s[0:1] 2699 // s_add_u32 s0, s0, $symbol 2700 // s_addc_u32 s1, s1, 0 2701 // 2702 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2703 // a fixup or relocation is emitted to replace $symbol with a literal 2704 // constant, which is a pc-relative offset from the encoding of the $symbol 2705 // operand to the global variable. 2706 // 2707 // For global address space: 2708 // s_getpc_b64 s[0:1] 2709 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2710 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2711 // 2712 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2713 // fixups or relocations are emitted to replace $symbol@*@lo and 2714 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2715 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2716 // operand to the global variable. 2717 // 2718 // What we want here is an offset from the value returned by s_getpc 2719 // (which is the address of the s_add_u32 instruction) to the global 2720 // variable, but since the encoding of $symbol starts 4 bytes after the start 2721 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2722 // small. This requires us to add 4 to the global variable offset in order to 2723 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2724 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2725 // instruction. 
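// Hence the literal offsets used below: Offset + 4 for the low fixup on
// s_add_u32 and Offset + 12 for the high fixup on s_addc_u32.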
2726
2727 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2728
2729 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2730 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2731
2732 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2733 .addDef(PCReg);
2734
2735 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2736 if (GAFlags == SIInstrInfo::MO_NONE)
2737 MIB.addImm(0);
2738 else
2739 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2740
2741 if (!B.getMRI()->getRegClassOrNull(PCReg))
2742 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2743
2744 if (PtrTy.getSizeInBits() == 32)
2745 B.buildExtract(DstReg, PCReg, 0);
2746 return true;
2747 }
2748
2749 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2750 MachineInstr &MI, MachineRegisterInfo &MRI,
2751 MachineIRBuilder &B) const {
2752 Register DstReg = MI.getOperand(0).getReg();
2753 LLT Ty = MRI.getType(DstReg);
2754 unsigned AS = Ty.getAddressSpace();
2755
2756 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2757 MachineFunction &MF = B.getMF();
2758 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2759
2760 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2761 if (!MFI->isModuleEntryFunction() &&
2762 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2763 const Function &Fn = MF.getFunction();
2764 DiagnosticInfoUnsupported BadLDSDecl(
2765 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2766 DS_Warning);
2767 Fn.getContext().diagnose(BadLDSDecl);
2768
2769 // We currently don't have a way to correctly allocate LDS objects that
2770 // aren't directly associated with a kernel. We do force inlining of
2771 // functions that use local objects. However, if these dead functions are
2772 // not eliminated, we don't want a compile time error. Just emit a warning
2773 // and a trap, since there should be no callable path here.
2774 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2775 B.buildUndef(DstReg);
2776 MI.eraseFromParent();
2777 return true;
2778 }
2779
2780 // TODO: We could emit code to handle the initialization somewhere.
2781 // We ignore the initializer for now and legalize it to allow selection.
2782 // Any initializer will be rejected with an error during assembly emission.
2783 const SITargetLowering *TLI = ST.getTargetLowering();
2784 if (!TLI->shouldUseLDSConstAddress(GV)) {
2785 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2786 return true; // Leave in place.
2787 }
2788
2789 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2790 Type *Ty = GV->getValueType();
2791 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2792 // zero-sized type in other languages to declare the dynamic shared
2793 // memory whose size is not known at compile time. Such variables are
2794 // allocated by the runtime and placed directly after the statically
2795 // allocated ones. They all share the same offset.
2796 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2797 // Adjust alignment for that dynamic shared memory array.
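// Descriptive note: the "address" produced below for such a dynamic LDS
// variable is the total statically allocated LDS size
// (Intrinsic::amdgcn_groupstaticsize), i.e. the first offset past the static
// allocations, matching the comment above about dynamic allocations
// following the static ones.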
2798 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); 2799 LLT S32 = LLT::scalar(32); 2800 auto Sz = 2801 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2802 B.buildIntToPtr(DstReg, Sz); 2803 MI.eraseFromParent(); 2804 return true; 2805 } 2806 } 2807 2808 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2809 *cast<GlobalVariable>(GV))); 2810 MI.eraseFromParent(); 2811 return true; 2812 } 2813 2814 const SITargetLowering *TLI = ST.getTargetLowering(); 2815 2816 if (TLI->shouldEmitFixup(GV)) { 2817 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2818 MI.eraseFromParent(); 2819 return true; 2820 } 2821 2822 if (TLI->shouldEmitPCReloc(GV)) { 2823 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2824 MI.eraseFromParent(); 2825 return true; 2826 } 2827 2828 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2829 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2830 2831 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2832 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2833 MachinePointerInfo::getGOT(MF), 2834 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2835 MachineMemOperand::MOInvariant, 2836 LoadTy, Align(8)); 2837 2838 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2839 2840 if (Ty.getSizeInBits() == 32) { 2841 // Truncate if this is a 32-bit constant address. 2842 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2843 B.buildExtract(DstReg, Load, 0); 2844 } else 2845 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2846 2847 MI.eraseFromParent(); 2848 return true; 2849 } 2850 2851 static LLT widenToNextPowerOf2(LLT Ty) { 2852 if (Ty.isVector()) 2853 return Ty.changeElementCount( 2854 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2855 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2856 } 2857 2858 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2859 MachineInstr &MI) const { 2860 MachineIRBuilder &B = Helper.MIRBuilder; 2861 MachineRegisterInfo &MRI = *B.getMRI(); 2862 GISelChangeObserver &Observer = Helper.Observer; 2863 2864 Register PtrReg = MI.getOperand(1).getReg(); 2865 LLT PtrTy = MRI.getType(PtrReg); 2866 unsigned AddrSpace = PtrTy.getAddressSpace(); 2867 2868 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2869 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2870 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2871 Observer.changingInstr(MI); 2872 MI.getOperand(1).setReg(Cast.getReg(0)); 2873 Observer.changedInstr(MI); 2874 return true; 2875 } 2876 2877 if (MI.getOpcode() != AMDGPU::G_LOAD) 2878 return false; 2879 2880 Register ValReg = MI.getOperand(0).getReg(); 2881 LLT ValTy = MRI.getType(ValReg); 2882 2883 if (hasBufferRsrcWorkaround(ValTy)) { 2884 Observer.changingInstr(MI); 2885 castBufferRsrcFromV4I32(MI, B, MRI, 0); 2886 Observer.changedInstr(MI); 2887 return true; 2888 } 2889 2890 MachineMemOperand *MMO = *MI.memoperands_begin(); 2891 const unsigned ValSize = ValTy.getSizeInBits(); 2892 const LLT MemTy = MMO->getMemoryType(); 2893 const Align MemAlign = MMO->getAlign(); 2894 const unsigned MemSize = MemTy.getSizeInBits(); 2895 const uint64_t AlignInBits = 8 * MemAlign.value(); 2896 2897 // Widen non-power-of-2 loads to the alignment if needed 2898 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2899 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2900 2901 // This was already the correct extending load result type, so just adjust 2902 // the 
memory type. 2903 if (WideMemSize == ValSize) { 2904 MachineFunction &MF = B.getMF(); 2905 2906 MachineMemOperand *WideMMO = 2907 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2908 Observer.changingInstr(MI); 2909 MI.setMemRefs(MF, {WideMMO}); 2910 Observer.changedInstr(MI); 2911 return true; 2912 } 2913 2914 // Don't bother handling edge case that should probably never be produced. 2915 if (ValSize > WideMemSize) 2916 return false; 2917 2918 LLT WideTy = widenToNextPowerOf2(ValTy); 2919 2920 Register WideLoad; 2921 if (!WideTy.isVector()) { 2922 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2923 B.buildTrunc(ValReg, WideLoad).getReg(0); 2924 } else { 2925 // Extract the subvector. 2926 2927 if (isRegisterType(ValTy)) { 2928 // If this a case where G_EXTRACT is legal, use it. 2929 // (e.g. <3 x s32> -> <4 x s32>) 2930 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2931 B.buildExtract(ValReg, WideLoad, 0); 2932 } else { 2933 // For cases where the widened type isn't a nice register value, unmerge 2934 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2935 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2936 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2937 } 2938 } 2939 2940 MI.eraseFromParent(); 2941 return true; 2942 } 2943 2944 return false; 2945 } 2946 2947 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper, 2948 MachineInstr &MI) const { 2949 MachineIRBuilder &B = Helper.MIRBuilder; 2950 MachineRegisterInfo &MRI = *B.getMRI(); 2951 GISelChangeObserver &Observer = Helper.Observer; 2952 2953 Register DataReg = MI.getOperand(0).getReg(); 2954 LLT DataTy = MRI.getType(DataReg); 2955 2956 if (hasBufferRsrcWorkaround(DataTy)) { 2957 Observer.changingInstr(MI); 2958 castBufferRsrcArgToV4I32(MI, B, 0); 2959 Observer.changedInstr(MI); 2960 return true; 2961 } 2962 return false; 2963 } 2964 2965 bool AMDGPULegalizerInfo::legalizeFMad( 2966 MachineInstr &MI, MachineRegisterInfo &MRI, 2967 MachineIRBuilder &B) const { 2968 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2969 assert(Ty.isScalar()); 2970 2971 MachineFunction &MF = B.getMF(); 2972 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2973 2974 // TODO: Always legal with future ftz flag. 2975 // FIXME: Do we need just output? 
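// Descriptive note: G_FMAD is kept legal only when the corresponding denormal
// mode is preserve-sign (presumably because the mad instructions do not
// handle denormal inputs); otherwise it is expanded to fmul + fadd through
// LegalizerHelper::lowerFMad below.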
2976 if (Ty == LLT::scalar(32) && 2977 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()) 2978 return true; 2979 if (Ty == LLT::scalar(16) && 2980 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()) 2981 return true; 2982 2983 MachineIRBuilder HelperBuilder(MI); 2984 GISelObserverWrapper DummyObserver; 2985 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2986 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2987 } 2988 2989 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2990 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2991 Register DstReg = MI.getOperand(0).getReg(); 2992 Register PtrReg = MI.getOperand(1).getReg(); 2993 Register CmpVal = MI.getOperand(2).getReg(); 2994 Register NewVal = MI.getOperand(3).getReg(); 2995 2996 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2997 "this should not have been custom lowered"); 2998 2999 LLT ValTy = MRI.getType(CmpVal); 3000 LLT VecTy = LLT::fixed_vector(2, ValTy); 3001 3002 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 3003 3004 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 3005 .addDef(DstReg) 3006 .addUse(PtrReg) 3007 .addUse(PackedVal) 3008 .setMemRefs(MI.memoperands()); 3009 3010 MI.eraseFromParent(); 3011 return true; 3012 } 3013 3014 /// Return true if it's known that \p Src can never be an f32 denormal value. 3015 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, 3016 Register Src) { 3017 Register ExtSrc; 3018 if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc)))) 3019 return MRI.getType(ExtSrc) == LLT::scalar(16); 3020 return false; 3021 } 3022 3023 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { 3024 if (Flags & MachineInstr::FmAfn) 3025 return true; 3026 const auto &Options = MF.getTarget().Options; 3027 return Options.UnsafeFPMath || Options.ApproxFuncFPMath; 3028 } 3029 3030 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, 3031 unsigned Flags) { 3032 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) && 3033 MF.getDenormalMode(APFloat::IEEEsingle()).Input != 3034 DenormalMode::PreserveSign; 3035 } 3036 3037 std::pair<Register, Register> 3038 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src, 3039 unsigned Flags) const { 3040 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) 3041 return {}; 3042 3043 const LLT F32 = LLT::scalar(32); 3044 auto SmallestNormal = B.buildFConstant( 3045 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle())); 3046 auto IsLtSmallestNormal = 3047 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal); 3048 3049 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32); 3050 auto One = B.buildFConstant(F32, 1.0); 3051 auto ScaleFactor = 3052 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags); 3053 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags); 3054 3055 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)}; 3056 } 3057 3058 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI, 3059 MachineIRBuilder &B) const { 3060 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. 3061 // If we have to handle denormals, scale up the input and adjust the result. 3062 3063 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) 3064 // log2 = amdgpu_log2 - (is_denormal ? 
32.0 : 0.0) 3065 3066 Register Dst = MI.getOperand(0).getReg(); 3067 Register Src = MI.getOperand(1).getReg(); 3068 LLT Ty = B.getMRI()->getType(Dst); 3069 unsigned Flags = MI.getFlags(); 3070 3071 if (Ty == LLT::scalar(16)) { 3072 const LLT F32 = LLT::scalar(32); 3073 // Nothing in half is a denormal when promoted to f32. 3074 auto Ext = B.buildFPExt(F32, Src, Flags); 3075 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false) 3076 .addUse(Ext.getReg(0)) 3077 .setMIFlags(Flags); 3078 B.buildFPTrunc(Dst, Log2, Flags); 3079 MI.eraseFromParent(); 3080 return true; 3081 } 3082 3083 assert(Ty == LLT::scalar(32)); 3084 3085 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags); 3086 if (!ScaledInput) { 3087 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}, false) 3088 .addUse(Src) 3089 .setMIFlags(Flags); 3090 MI.eraseFromParent(); 3091 return true; 3092 } 3093 3094 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) 3095 .addUse(ScaledInput) 3096 .setMIFlags(Flags); 3097 3098 auto ThirtyTwo = B.buildFConstant(Ty, 32.0); 3099 auto Zero = B.buildFConstant(Ty, 0.0); 3100 auto ResultOffset = 3101 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags); 3102 B.buildFSub(Dst, Log2, ResultOffset, Flags); 3103 3104 MI.eraseFromParent(); 3105 return true; 3106 } 3107 3108 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y, 3109 Register Z, unsigned Flags) { 3110 auto FMul = B.buildFMul(Ty, X, Y, Flags); 3111 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0); 3112 } 3113 3114 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, 3115 MachineIRBuilder &B) const { 3116 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10; 3117 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG); 3118 3119 MachineRegisterInfo &MRI = *B.getMRI(); 3120 Register Dst = MI.getOperand(0).getReg(); 3121 Register X = MI.getOperand(1).getReg(); 3122 unsigned Flags = MI.getFlags(); 3123 const LLT Ty = MRI.getType(X); 3124 MachineFunction &MF = B.getMF(); 3125 3126 const LLT F32 = LLT::scalar(32); 3127 const LLT F16 = LLT::scalar(16); 3128 3129 const AMDGPUTargetMachine &TM = 3130 static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 3131 3132 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || 3133 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { 3134 if (Ty == F16 && !ST.has16BitInsts()) { 3135 Register LogVal = MRI.createGenericVirtualRegister(F32); 3136 auto PromoteSrc = B.buildFPExt(F32, X); 3137 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags); 3138 B.buildFPTrunc(Dst, LogVal); 3139 } else { 3140 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags); 3141 } 3142 3143 MI.eraseFromParent(); 3144 return true; 3145 } 3146 3147 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags); 3148 if (ScaledInput) 3149 X = ScaledInput; 3150 3151 auto Y = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) 3152 .addUse(X) 3153 .setMIFlags(Flags); 3154 3155 Register R; 3156 if (ST.hasFastFMAF32()) { 3157 // c+cc are ln(2)/ln(10) to more than 49 bits 3158 const float c_log10 = 0x1.344134p-2f; 3159 const float cc_log10 = 0x1.09f79ep-26f; 3160 3161 // c + cc is ln(2) to more than 49 bits 3162 const float c_log = 0x1.62e42ep-1f; 3163 const float cc_log = 0x1.efa39ep-25f; 3164 3165 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); 3166 auto CC = B.buildFConstant(Ty, IsLog10 ? 
cc_log10 : cc_log); 3167 3168 R = B.buildFMul(Ty, Y, C, Flags).getReg(0); 3169 auto NegR = B.buildFNeg(Ty, R, Flags); 3170 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); 3171 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); 3172 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); 3173 } else { 3174 // ch+ct is ln(2)/ln(10) to more than 36 bits 3175 const float ch_log10 = 0x1.344000p-2f; 3176 const float ct_log10 = 0x1.3509f6p-18f; 3177 3178 // ch + ct is ln(2) to more than 36 bits 3179 const float ch_log = 0x1.62e000p-1f; 3180 const float ct_log = 0x1.0bfbe8p-15f; 3181 3182 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log); 3183 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log); 3184 3185 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3186 auto YH = B.buildAnd(Ty, Y, MaskConst); 3187 auto YT = B.buildFSub(Ty, Y, YH, Flags); 3188 auto YTCT = B.buildFMul(Ty, YT, CT, Flags); 3189 3190 Register Mad0 = 3191 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); 3192 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); 3193 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); 3194 } 3195 3196 const bool IsFiniteOnly = 3197 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && 3198 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); 3199 3200 if (!IsFiniteOnly) { 3201 // Expand isfinite(x) => fabs(x) < inf 3202 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3203 auto Fabs = B.buildFAbs(Ty, Y); 3204 auto IsFinite = 3205 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 3206 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0); 3207 } 3208 3209 if (ScaledInput) { 3210 auto Zero = B.buildFConstant(Ty, 0.0); 3211 auto ShiftK = 3212 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f); 3213 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags); 3214 B.buildFSub(Dst, R, Shift, Flags); 3215 } else { 3216 B.buildCopy(Dst, R); 3217 } 3218 3219 MI.eraseFromParent(); 3220 return true; 3221 } 3222 3223 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, 3224 Register Src, bool IsLog10, 3225 unsigned Flags) const { 3226 const double Log2BaseInverted = 3227 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; 3228 3229 LLT Ty = B.getMRI()->getType(Dst); 3230 3231 if (Ty == LLT::scalar(32)) { 3232 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags); 3233 if (ScaledInput) { 3234 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) 3235 .addUse(Src) 3236 .setMIFlags(Flags); 3237 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted); 3238 auto Zero = B.buildFConstant(Ty, 0.0); 3239 auto ResultOffset = 3240 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags); 3241 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted); 3242 3243 if (ST.hasFastFMAF32()) 3244 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags); 3245 else { 3246 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags); 3247 B.buildFAdd(Dst, Mul, ResultOffset, Flags); 3248 } 3249 3250 return true; 3251 } 3252 } 3253 3254 auto Log2Operand = Ty == LLT::scalar(16) 3255 ? 
B.buildFLog2(Ty, Src, Flags) 3256 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) 3257 .addUse(Src) 3258 .setMIFlags(Flags); 3259 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 3260 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 3261 return true; 3262 } 3263 3264 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, 3265 MachineIRBuilder &B) const { 3266 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. 3267 // If we have to handle denormals, scale up the input and adjust the result. 3268 3269 Register Dst = MI.getOperand(0).getReg(); 3270 Register Src = MI.getOperand(1).getReg(); 3271 unsigned Flags = MI.getFlags(); 3272 LLT Ty = B.getMRI()->getType(Dst); 3273 const LLT F16 = LLT::scalar(16); 3274 const LLT F32 = LLT::scalar(32); 3275 3276 if (Ty == F16) { 3277 // Nothing in half is a denormal when promoted to f32. 3278 auto Ext = B.buildFPExt(F32, Src, Flags); 3279 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false) 3280 .addUse(Ext.getReg(0)) 3281 .setMIFlags(Flags); 3282 B.buildFPTrunc(Dst, Log2, Flags); 3283 MI.eraseFromParent(); 3284 return true; 3285 } 3286 3287 assert(Ty == F32); 3288 3289 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) { 3290 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false) 3291 .addUse(Src) 3292 .setMIFlags(Flags); 3293 MI.eraseFromParent(); 3294 return true; 3295 } 3296 3297 // bool needs_scaling = x < -0x1.f80000p+6f; 3298 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); 3299 3300 // -nextafter(128.0, -1) 3301 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f); 3302 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, 3303 RangeCheckConst, Flags); 3304 3305 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f); 3306 auto Zero = B.buildFConstant(Ty, 0.0); 3307 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags); 3308 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags); 3309 3310 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false) 3311 .addUse(AddInput.getReg(0)) 3312 .setMIFlags(Flags); 3313 3314 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f); 3315 auto One = B.buildFConstant(Ty, 1.0); 3316 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags); 3317 B.buildFMul(Dst, Exp2, ResultScale, Flags); 3318 MI.eraseFromParent(); 3319 return true; 3320 } 3321 3322 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, 3323 Register Src, 3324 unsigned Flags) const { 3325 LLT Ty = B.getMRI()->getType(Dst); 3326 auto K = B.buildFConstant(Ty, numbers::log2e); 3327 auto Mul = B.buildFMul(Ty, Src, K, Flags); 3328 3329 if (Ty == LLT::scalar(32)) { 3330 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false) 3331 .addUse(Mul.getReg(0)) 3332 .setMIFlags(Flags); 3333 } else { 3334 B.buildFExp2(Dst, Mul.getReg(0), Flags); 3335 } 3336 3337 return true; 3338 } 3339 3340 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 3341 MachineIRBuilder &B) const { 3342 Register Dst = MI.getOperand(0).getReg(); 3343 Register X = MI.getOperand(1).getReg(); 3344 const unsigned Flags = MI.getFlags(); 3345 MachineFunction &MF = B.getMF(); 3346 MachineRegisterInfo &MRI = *B.getMRI(); 3347 LLT Ty = MRI.getType(Dst); 3348 const LLT F16 = LLT::scalar(16); 3349 const LLT F32 = LLT::scalar(32); 3350 const bool IsExp10 = false; // TODO: For some reason exp10 is missing 3351 3352 if (Ty == F16) { 3353 // v_exp_f16 (fmul x, log2e) 3354 
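    // Both paths below use the identity exp(x) = exp2(x * log2(e)):
    // legalizeFExpUnsafe multiplies by log2(e) and then emits either G_FEXP2
    // (for f16) or the amdgcn_exp2 intrinsic (for the value promoted to f32).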
if (allowApproxFunc(MF, Flags)) { 3355 // TODO: Does this really require fast? 3356 legalizeFExpUnsafe(B, Dst, X, Flags); 3357 MI.eraseFromParent(); 3358 return true; 3359 } 3360 3361 // exp(f16 x) -> 3362 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 3363 3364 // Nothing in half is a denormal when promoted to f32. 3365 auto Ext = B.buildFPExt(F32, X, Flags); 3366 Register Lowered = MRI.createGenericVirtualRegister(F32); 3367 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); 3368 B.buildFPTrunc(Dst, Lowered, Flags); 3369 MI.eraseFromParent(); 3370 return true; 3371 } 3372 3373 assert(Ty == F32); 3374 3375 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 3376 // library behavior. Also, is known-not-daz source sufficient? 3377 if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) { 3378 legalizeFExpUnsafe(B, Dst, X, Flags); 3379 MI.eraseFromParent(); 3380 return true; 3381 } 3382 3383 // Algorithm: 3384 // 3385 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 3386 // 3387 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 3388 // n = 64*m + j, 0 <= j < 64 3389 // 3390 // e^x = 2^((64*m + j + f)/64) 3391 // = (2^m) * (2^(j/64)) * 2^(f/64) 3392 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 3393 // 3394 // f = x*(64/ln(2)) - n 3395 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 3396 // 3397 // e^x = (2^m) * (2^(j/64)) * e^r 3398 // 3399 // (2^(j/64)) is precomputed 3400 // 3401 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3402 // e^r = 1 + q 3403 // 3404 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3405 // 3406 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 3407 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract; 3408 Register PH, PL; 3409 3410 if (ST.hasFastFMAF32()) { 3411 const float c_exp = numbers::log2ef; 3412 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 3413 const float c_exp10 = 0x1.a934f0p+1f; 3414 const float cc_exp10 = 0x1.2f346ep-24f; 3415 3416 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp); 3417 PH = B.buildFMul(Ty, X, C, Flags).getReg(0); 3418 auto NegPH = B.buildFNeg(Ty, PH, Flags); 3419 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags); 3420 3421 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp); 3422 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0); 3423 } else { 3424 const float ch_exp = 0x1.714000p+0f; 3425 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3426 3427 const float ch_exp10 = 0x1.a92000p+1f; 3428 const float cl_exp10 = 0x1.4f0978p-11f; 3429 3430 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3431 auto XH = B.buildAnd(Ty, X, MaskConst); 3432 auto XL = B.buildFSub(Ty, X, XH, Flags); 3433 3434 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp); 3435 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0); 3436 3437 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp); 3438 auto XLCL = B.buildFMul(Ty, XL, CL, Flags); 3439 3440 Register Mad0 = 3441 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags); 3442 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags); 3443 } 3444 3445 auto E = B.buildFRint(Ty, PH, Flags); 3446 3447 // It is unsafe to contract this fsub into the PH multiply. 
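  // If the subtraction were contracted, PH - E would be evaluated against the
  // unrounded product rather than against the rounded PH that PL was computed
  // to correct, breaking the extended-precision split; hence FlagsNoContract.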
3448 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract); 3449 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags); 3450 auto IntE = B.buildFPTOSI(LLT::scalar(32), E); 3451 3452 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false) 3453 .addUse(A.getReg(0)) 3454 .setMIFlags(Flags); 3455 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags); 3456 3457 auto UnderflowCheckConst = 3458 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f); 3459 auto Zero = B.buildFConstant(Ty, 0.0); 3460 auto Underflow = 3461 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst); 3462 3463 R = B.buildSelect(Ty, Underflow, Zero, R); 3464 3465 const auto &Options = MF.getTarget().Options; 3466 3467 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { 3468 auto OverflowCheckConst = 3469 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f); 3470 3471 auto Overflow = 3472 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst); 3473 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3474 R = B.buildSelect(Ty, Overflow, Inf, R, Flags); 3475 } 3476 3477 B.buildCopy(Dst, R); 3478 MI.eraseFromParent(); 3479 return true; 3480 } 3481 3482 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 3483 MachineIRBuilder &B) const { 3484 Register Dst = MI.getOperand(0).getReg(); 3485 Register Src0 = MI.getOperand(1).getReg(); 3486 Register Src1 = MI.getOperand(2).getReg(); 3487 unsigned Flags = MI.getFlags(); 3488 LLT Ty = B.getMRI()->getType(Dst); 3489 const LLT S16 = LLT::scalar(16); 3490 const LLT S32 = LLT::scalar(32); 3491 3492 if (Ty == S32) { 3493 auto Log = B.buildFLog2(S32, Src0, Flags); 3494 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 3495 .addUse(Log.getReg(0)) 3496 .addUse(Src1) 3497 .setMIFlags(Flags); 3498 B.buildFExp2(Dst, Mul, Flags); 3499 } else if (Ty == S16) { 3500 // There's no f16 fmul_legacy, so we need to convert for it. 3501 auto Log = B.buildFLog2(S16, Src0, Flags); 3502 auto Ext0 = B.buildFPExt(S32, Log, Flags); 3503 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 3504 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 3505 .addUse(Ext0.getReg(0)) 3506 .addUse(Ext1.getReg(0)) 3507 .setMIFlags(Flags); 3508 3509 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 3510 } else 3511 return false; 3512 3513 MI.eraseFromParent(); 3514 return true; 3515 } 3516 3517 // Find a source register, ignoring any possible source modifiers. 
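// Looks through G_FNEG and G_FABS (including the combined fneg(fabs(x))
// form), since those are expected to be folded into the user as source
// modifiers during instruction selection.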
3518 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 3519 Register ModSrc = OrigSrc; 3520 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 3521 ModSrc = SrcFNeg->getOperand(1).getReg(); 3522 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3523 ModSrc = SrcFAbs->getOperand(1).getReg(); 3524 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3525 ModSrc = SrcFAbs->getOperand(1).getReg(); 3526 return ModSrc; 3527 } 3528 3529 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 3530 MachineRegisterInfo &MRI, 3531 MachineIRBuilder &B) const { 3532 3533 const LLT S1 = LLT::scalar(1); 3534 const LLT S64 = LLT::scalar(64); 3535 Register Dst = MI.getOperand(0).getReg(); 3536 Register OrigSrc = MI.getOperand(1).getReg(); 3537 unsigned Flags = MI.getFlags(); 3538 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 3539 "this should not have been custom lowered"); 3540 3541 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 3542 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 3543 // efficient way to implement it is using V_FRACT_F64. The workaround for the 3544 // V_FRACT bug is: 3545 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 3546 // 3547 // Convert floor(x) to (x - fract(x)) 3548 3549 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 3550 .addUse(OrigSrc) 3551 .setMIFlags(Flags); 3552 3553 // Give source modifier matching some assistance before obscuring a foldable 3554 // pattern. 3555 3556 // TODO: We can avoid the neg on the fract? The input sign to fract 3557 // shouldn't matter? 3558 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 3559 3560 auto Const = 3561 B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff)); 3562 3563 Register Min = MRI.createGenericVirtualRegister(S64); 3564 3565 // We don't need to concern ourselves with the snan handling difference, so 3566 // use the one which will directly select. 3567 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3568 if (MFI->getMode().IEEE) 3569 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 3570 else 3571 B.buildFMinNum(Min, Fract, Const, Flags); 3572 3573 Register CorrectedFract = Min; 3574 if (!MI.getFlag(MachineInstr::FmNoNans)) { 3575 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 3576 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 3577 } 3578 3579 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 3580 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 3581 3582 MI.eraseFromParent(); 3583 return true; 3584 } 3585 3586 // Turn an illegal packed v2s16 build vector into bit operations. 3587 // TODO: This should probably be a bitcast action in LegalizerHelper. 
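// Conceptually the lowering is:
//   %merge:_(s32) = G_MERGE_VALUES %elt0:_(s16), %elt1:_(s16)
//   %dst:_(<2 x s16>) = G_BITCAST %merge
// so element 0 ends up in the low 16 bits of the packed 32-bit register and
// element 1 in the high 16 bits.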
3588 bool AMDGPULegalizerInfo::legalizeBuildVector( 3589 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3590 Register Dst = MI.getOperand(0).getReg(); 3591 const LLT S32 = LLT::scalar(32); 3592 const LLT S16 = LLT::scalar(16); 3593 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 3594 3595 Register Src0 = MI.getOperand(1).getReg(); 3596 Register Src1 = MI.getOperand(2).getReg(); 3597 3598 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { 3599 assert(MRI.getType(Src0) == S32); 3600 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); 3601 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); 3602 } 3603 3604 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); 3605 B.buildBitcast(Dst, Merge); 3606 3607 MI.eraseFromParent(); 3608 return true; 3609 } 3610 3611 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 3612 // 3613 // Source and accumulation registers must all be 32-bits. 3614 // 3615 // TODO: When the multiply is uniform, we should produce a code sequence 3616 // that is better suited to instruction selection on the SALU. Instead of 3617 // the outer loop going over parts of the result, the outer loop should go 3618 // over parts of one of the factors. This should result in instruction 3619 // selection that makes full use of S_ADDC_U32 instructions. 3620 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, 3621 MutableArrayRef<Register> Accum, 3622 ArrayRef<Register> Src0, 3623 ArrayRef<Register> Src1, 3624 bool UsePartialMad64_32, 3625 bool SeparateOddAlignedProducts) const { 3626 // Use (possibly empty) vectors of S1 registers to represent the set of 3627 // carries from one pair of positions to the next. 3628 using Carry = SmallVector<Register, 2>; 3629 3630 MachineIRBuilder &B = Helper.MIRBuilder; 3631 GISelKnownBits &KB = *Helper.getKnownBits(); 3632 3633 const LLT S1 = LLT::scalar(1); 3634 const LLT S32 = LLT::scalar(32); 3635 const LLT S64 = LLT::scalar(64); 3636 3637 Register Zero32; 3638 Register Zero64; 3639 3640 auto getZero32 = [&]() -> Register { 3641 if (!Zero32) 3642 Zero32 = B.buildConstant(S32, 0).getReg(0); 3643 return Zero32; 3644 }; 3645 auto getZero64 = [&]() -> Register { 3646 if (!Zero64) 3647 Zero64 = B.buildConstant(S64, 0).getReg(0); 3648 return Zero64; 3649 }; 3650 3651 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; 3652 for (unsigned i = 0; i < Src0.size(); ++i) { 3653 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero()); 3654 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero()); 3655 } 3656 3657 // Merge the given carries into the 32-bit LocalAccum, which is modified 3658 // in-place. 3659 // 3660 // Returns the carry-out, which is a single S1 register or null. 
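  //
  // All but the last carry are first folded into a 32-bit CarryAccum using
  // zero-extension and G_UADDE; the final G_UADDE adds CarryAccum into
  // LocalAccum, and its carry-out is only reported when LocalAccum could
  // actually overflow.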
3661 auto mergeCarry = 3662 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 3663 if (CarryIn.empty()) 3664 return Register(); 3665 3666 bool HaveCarryOut = true; 3667 Register CarryAccum; 3668 if (CarryIn.size() == 1) { 3669 if (!LocalAccum) { 3670 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3671 return Register(); 3672 } 3673 3674 CarryAccum = getZero32(); 3675 } else { 3676 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3677 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 3678 CarryAccum = 3679 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 3680 .getReg(0); 3681 } 3682 3683 if (!LocalAccum) { 3684 LocalAccum = getZero32(); 3685 HaveCarryOut = false; 3686 } 3687 } 3688 3689 auto Add = 3690 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 3691 LocalAccum = Add.getReg(0); 3692 return HaveCarryOut ? Add.getReg(1) : Register(); 3693 }; 3694 3695 // Build a multiply-add chain to compute 3696 // 3697 // LocalAccum + (partial products at DstIndex) 3698 // + (opportunistic subset of CarryIn) 3699 // 3700 // LocalAccum is an array of one or two 32-bit registers that are updated 3701 // in-place. The incoming registers may be null. 3702 // 3703 // In some edge cases, carry-ins can be consumed "for free". In that case, 3704 // the consumed carry bits are removed from CarryIn in-place. 3705 auto buildMadChain = 3706 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 3707 -> Carry { 3708 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 3709 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 3710 3711 Carry CarryOut; 3712 unsigned j0 = 0; 3713 3714 // Use plain 32-bit multiplication for the most significant part of the 3715 // result by default. 3716 if (LocalAccum.size() == 1 && 3717 (!UsePartialMad64_32 || !CarryIn.empty())) { 3718 do { 3719 // Skip multiplication if one of the operands is 0 3720 unsigned j1 = DstIndex - j0; 3721 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 3722 ++j0; 3723 continue; 3724 } 3725 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 3726 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) { 3727 LocalAccum[0] = Mul.getReg(0); 3728 } else { 3729 if (CarryIn.empty()) { 3730 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 3731 } else { 3732 LocalAccum[0] = 3733 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 3734 .getReg(0); 3735 CarryIn.pop_back(); 3736 } 3737 } 3738 ++j0; 3739 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 3740 } 3741 3742 // Build full 64-bit multiplies. 
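    // Tmp accumulates the 64-bit running sum via G_AMDGPU_MAD_U64_U32
    // (Tmp = Src0[j0] * Src1[j1] + Tmp). A carry-out is only recorded once
    // the accumulator may already hold a full 64-bit value (HaveSmallAccum
    // is false); with a small or don't-care accumulator the carry can be
    // ignored.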
3743 if (j0 <= DstIndex) { 3744 bool HaveSmallAccum = false; 3745 Register Tmp; 3746 3747 if (LocalAccum[0]) { 3748 if (LocalAccum.size() == 1) { 3749 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 3750 HaveSmallAccum = true; 3751 } else if (LocalAccum[1]) { 3752 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); 3753 HaveSmallAccum = false; 3754 } else { 3755 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 3756 HaveSmallAccum = true; 3757 } 3758 } else { 3759 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 3760 Tmp = getZero64(); 3761 HaveSmallAccum = true; 3762 } 3763 3764 do { 3765 unsigned j1 = DstIndex - j0; 3766 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 3767 ++j0; 3768 continue; 3769 } 3770 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 3771 {Src0[j0], Src1[j1], Tmp}); 3772 Tmp = Mad.getReg(0); 3773 if (!HaveSmallAccum) 3774 CarryOut.push_back(Mad.getReg(1)); 3775 HaveSmallAccum = false; 3776 3777 ++j0; 3778 } while (j0 <= DstIndex); 3779 3780 auto Unmerge = B.buildUnmerge(S32, Tmp); 3781 LocalAccum[0] = Unmerge.getReg(0); 3782 if (LocalAccum.size() > 1) 3783 LocalAccum[1] = Unmerge.getReg(1); 3784 } 3785 3786 return CarryOut; 3787 }; 3788 3789 // Outer multiply loop, iterating over destination parts from least 3790 // significant to most significant parts. 3791 // 3792 // The columns of the following diagram correspond to the destination parts 3793 // affected by one iteration of the outer loop (ignoring boundary 3794 // conditions). 3795 // 3796 // Dest index relative to 2 * i: 1 0 -1 3797 // ------ 3798 // Carries from previous iteration: e o 3799 // Even-aligned partial product sum: E E . 3800 // Odd-aligned partial product sum: O O 3801 // 3802 // 'o' is OddCarry, 'e' is EvenCarry. 3803 // EE and OO are computed from partial products via buildMadChain and use 3804 // accumulation where possible and appropriate. 3805 // 3806 Register SeparateOddCarry; 3807 Carry EvenCarry; 3808 Carry OddCarry; 3809 3810 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 3811 Carry OddCarryIn = std::move(OddCarry); 3812 Carry EvenCarryIn = std::move(EvenCarry); 3813 OddCarry.clear(); 3814 EvenCarry.clear(); 3815 3816 // Partial products at offset 2 * i. 3817 if (2 * i < Accum.size()) { 3818 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 3819 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 3820 } 3821 3822 // Partial products at offset 2 * i - 1. 3823 if (i > 0) { 3824 if (!SeparateOddAlignedProducts) { 3825 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 3826 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3827 } else { 3828 bool IsHighest = 2 * i >= Accum.size(); 3829 Register SeparateOddOut[2]; 3830 auto LocalAccum = MutableArrayRef(SeparateOddOut) 3831 .take_front(IsHighest ? 
1 : 2); 3832 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3833 3834 MachineInstr *Lo; 3835 3836 if (i == 1) { 3837 if (!IsHighest) 3838 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 3839 else 3840 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 3841 } else { 3842 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 3843 SeparateOddCarry); 3844 } 3845 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 3846 3847 if (!IsHighest) { 3848 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 3849 Lo->getOperand(1).getReg()); 3850 Accum[2 * i] = Hi.getReg(0); 3851 SeparateOddCarry = Hi.getReg(1); 3852 } 3853 } 3854 } 3855 3856 // Add in the carries from the previous iteration 3857 if (i > 0) { 3858 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 3859 EvenCarryIn.push_back(CarryOut); 3860 3861 if (2 * i < Accum.size()) { 3862 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 3863 OddCarry.push_back(CarryOut); 3864 } 3865 } 3866 } 3867 } 3868 3869 // Custom narrowing of wide multiplies using wide multiply-add instructions. 3870 // 3871 // TODO: If the multiply is followed by an addition, we should attempt to 3872 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 3873 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 3874 MachineInstr &MI) const { 3875 assert(ST.hasMad64_32()); 3876 assert(MI.getOpcode() == TargetOpcode::G_MUL); 3877 3878 MachineIRBuilder &B = Helper.MIRBuilder; 3879 MachineRegisterInfo &MRI = *B.getMRI(); 3880 3881 Register DstReg = MI.getOperand(0).getReg(); 3882 Register Src0 = MI.getOperand(1).getReg(); 3883 Register Src1 = MI.getOperand(2).getReg(); 3884 3885 LLT Ty = MRI.getType(DstReg); 3886 assert(Ty.isScalar()); 3887 3888 unsigned Size = Ty.getSizeInBits(); 3889 unsigned NumParts = Size / 32; 3890 assert((Size % 32) == 0); 3891 assert(NumParts >= 2); 3892 3893 // Whether to use MAD_64_32 for partial products whose high half is 3894 // discarded. This avoids some ADD instructions but risks false dependency 3895 // stalls on some subtargets in some cases. 3896 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 3897 3898 // Whether to compute odd-aligned partial products separately. This is 3899 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 3900 // in an even-aligned VGPR. 3901 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 3902 3903 LLT S32 = LLT::scalar(32); 3904 SmallVector<Register, 2> Src0Parts, Src1Parts; 3905 for (unsigned i = 0; i < NumParts; ++i) { 3906 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3907 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3908 } 3909 B.buildUnmerge(Src0Parts, Src0); 3910 B.buildUnmerge(Src1Parts, Src1); 3911 3912 SmallVector<Register, 2> AccumRegs(NumParts); 3913 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 3914 SeparateOddAlignedProducts); 3915 3916 B.buildMergeLikeInstr(DstReg, AccumRegs); 3917 MI.eraseFromParent(); 3918 return true; 3919 } 3920 3921 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 3922 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 3923 // case with a single min instruction instead of a compare+select. 
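// For example, a 32-bit G_CTLZ of zero: ffbh returns -1 (all ones), and
// umin(0xffffffff, 32) gives 32, the defined ctlz(0) result; any nonzero
// input produces an ffbh result of at most 31 and is left unchanged by the
// min.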
3924 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 3925 MachineRegisterInfo &MRI, 3926 MachineIRBuilder &B) const { 3927 Register Dst = MI.getOperand(0).getReg(); 3928 Register Src = MI.getOperand(1).getReg(); 3929 LLT DstTy = MRI.getType(Dst); 3930 LLT SrcTy = MRI.getType(Src); 3931 3932 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 3933 ? AMDGPU::G_AMDGPU_FFBH_U32 3934 : AMDGPU::G_AMDGPU_FFBL_B32; 3935 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 3936 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 3937 3938 MI.eraseFromParent(); 3939 return true; 3940 } 3941 3942 // Check that this is a G_XOR x, -1 3943 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 3944 if (MI.getOpcode() != TargetOpcode::G_XOR) 3945 return false; 3946 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 3947 return ConstVal && *ConstVal == -1; 3948 } 3949 3950 // Return the use branch instruction, otherwise null if the usage is invalid. 3951 static MachineInstr * 3952 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 3953 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 3954 Register CondDef = MI.getOperand(0).getReg(); 3955 if (!MRI.hasOneNonDBGUse(CondDef)) 3956 return nullptr; 3957 3958 MachineBasicBlock *Parent = MI.getParent(); 3959 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 3960 3961 if (isNot(MRI, *UseMI)) { 3962 Register NegatedCond = UseMI->getOperand(0).getReg(); 3963 if (!MRI.hasOneNonDBGUse(NegatedCond)) 3964 return nullptr; 3965 3966 // We're deleting the def of this value, so we need to remove it. 3967 eraseInstr(*UseMI, MRI); 3968 3969 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 3970 Negated = true; 3971 } 3972 3973 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 3974 return nullptr; 3975 3976 // Make sure the cond br is followed by a G_BR, or is the last instruction. 3977 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 3978 if (Next == Parent->end()) { 3979 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 3980 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 3981 return nullptr; 3982 UncondBrTarget = &*NextMBB; 3983 } else { 3984 if (Next->getOpcode() != AMDGPU::G_BR) 3985 return nullptr; 3986 Br = &*Next; 3987 UncondBrTarget = Br->getOperand(0).getMBB(); 3988 } 3989 3990 return UseMI; 3991 } 3992 3993 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 3994 const ArgDescriptor *Arg, 3995 const TargetRegisterClass *ArgRC, 3996 LLT ArgTy) const { 3997 MCRegister SrcReg = Arg->getRegister(); 3998 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 3999 assert(DstReg.isVirtual() && "Virtual register expected"); 4000 4001 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 4002 *ArgRC, B.getDebugLoc(), ArgTy); 4003 if (Arg->isMasked()) { 4004 // TODO: Should we try to emit this once in the entry block? 4005 const LLT S32 = LLT::scalar(32); 4006 const unsigned Mask = Arg->getMask(); 4007 const unsigned Shift = llvm::countr_zero<unsigned>(Mask); 4008 4009 Register AndMaskSrc = LiveIn; 4010 4011 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 4012 // 0. 
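    // E.g. for a packed workitem ID argument with Mask = 0x3ff << 10 (ID Y),
    // this emits (LiveIn >> 10) & 0x3ff.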
4013 if (Shift != 0) { 4014 auto ShiftAmt = B.buildConstant(S32, Shift); 4015 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 4016 } 4017 4018 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 4019 } else { 4020 B.buildCopy(DstReg, LiveIn); 4021 } 4022 4023 return true; 4024 } 4025 4026 bool AMDGPULegalizerInfo::loadInputValue( 4027 Register DstReg, MachineIRBuilder &B, 4028 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4029 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4030 const ArgDescriptor *Arg; 4031 const TargetRegisterClass *ArgRC; 4032 LLT ArgTy; 4033 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4034 4035 if (!Arg) { 4036 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 4037 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 4038 // case the pointer argument may be missing and we use null. 4039 B.buildConstant(DstReg, 0); 4040 return true; 4041 } 4042 4043 // It's undefined behavior if a function marked with the amdgpu-no-* 4044 // attributes uses the corresponding intrinsic. 4045 B.buildUndef(DstReg); 4046 return true; 4047 } 4048 4049 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 4050 return false; // TODO: Handle these 4051 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 4052 } 4053 4054 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 4055 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4056 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4057 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 4058 return false; 4059 4060 MI.eraseFromParent(); 4061 return true; 4062 } 4063 4064 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 4065 int64_t C) { 4066 B.buildConstant(MI.getOperand(0).getReg(), C); 4067 MI.eraseFromParent(); 4068 return true; 4069 } 4070 4071 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 4072 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4073 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4074 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 4075 if (MaxID == 0) 4076 return replaceWithConstant(B, MI, 0); 4077 4078 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4079 const ArgDescriptor *Arg; 4080 const TargetRegisterClass *ArgRC; 4081 LLT ArgTy; 4082 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4083 4084 Register DstReg = MI.getOperand(0).getReg(); 4085 if (!Arg) { 4086 // It's undefined behavior if a function marked with the amdgpu-no-* 4087 // attributes uses the corresponding intrinsic. 4088 B.buildUndef(DstReg); 4089 MI.eraseFromParent(); 4090 return true; 4091 } 4092 4093 if (Arg->isMasked()) { 4094 // Don't bother inserting AssertZext for packed IDs since we're emitting the 4095 // masking operations anyway. 4096 // 4097 // TODO: We could assert the top bit is 0 for the source copy. 
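    // The unmasked path below instead wraps the copy in G_ASSERT_ZEXT sized
    // by bit_width(MaxID), e.g. MaxID = 1023 asserts that only the low 10
    // bits may be set.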
4098 if (!loadInputValue(DstReg, B, ArgType)) 4099 return false; 4100 } else { 4101 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 4102 if (!loadInputValue(TmpReg, B, ArgType)) 4103 return false; 4104 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); 4105 } 4106 4107 MI.eraseFromParent(); 4108 return true; 4109 } 4110 4111 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 4112 int64_t Offset) const { 4113 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 4114 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 4115 4116 // TODO: If we passed in the base kernel offset we could have a better 4117 // alignment than 4, but we don't really need it. 4118 if (!loadInputValue(KernArgReg, B, 4119 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4120 llvm_unreachable("failed to find kernarg segment ptr"); 4121 4122 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 4123 // TODO: Should get nuw 4124 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 4125 } 4126 4127 /// Legalize a value that's loaded from kernel arguments. This is only used by 4128 /// legacy intrinsics. 4129 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 4130 MachineIRBuilder &B, 4131 uint64_t Offset, 4132 Align Alignment) const { 4133 Register DstReg = MI.getOperand(0).getReg(); 4134 4135 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 4136 "unexpected kernarg parameter type"); 4137 4138 Register Ptr = getKernargParameterPtr(B, Offset); 4139 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 4140 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 4141 MachineMemOperand::MODereferenceable | 4142 MachineMemOperand::MOInvariant); 4143 MI.eraseFromParent(); 4144 return true; 4145 } 4146 4147 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 4148 MachineRegisterInfo &MRI, 4149 MachineIRBuilder &B) const { 4150 Register Dst = MI.getOperand(0).getReg(); 4151 LLT DstTy = MRI.getType(Dst); 4152 LLT S16 = LLT::scalar(16); 4153 LLT S32 = LLT::scalar(32); 4154 LLT S64 = LLT::scalar(64); 4155 4156 if (DstTy == S16) 4157 return legalizeFDIV16(MI, MRI, B); 4158 if (DstTy == S32) 4159 return legalizeFDIV32(MI, MRI, B); 4160 if (DstTy == S64) 4161 return legalizeFDIV64(MI, MRI, B); 4162 4163 return false; 4164 } 4165 4166 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 4167 Register DstDivReg, 4168 Register DstRemReg, 4169 Register X, 4170 Register Y) const { 4171 const LLT S1 = LLT::scalar(1); 4172 const LLT S32 = LLT::scalar(32); 4173 4174 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 4175 // algorithm used here. 4176 4177 // Initial estimate of inv(y). 4178 auto FloatY = B.buildUITOFP(S32, Y); 4179 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 4180 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); 4181 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 4182 auto Z = B.buildFPTOUI(S32, ScaledY); 4183 4184 // One round of UNR. 4185 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 4186 auto NegYZ = B.buildMul(S32, NegY, Z); 4187 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 4188 4189 // Quotient/remainder estimate. 4190 auto Q = B.buildUMulH(S32, X, Z); 4191 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 4192 4193 // First quotient/remainder refinement. 
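  // After one Newton-Raphson step the quotient estimate Q is at most
  // slightly too small, which is why exactly two conditional corrections of
  // the form
  //   if (R >= Y) { Q += 1; R -= Y; }
  // follow to reach the exact quotient and remainder.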
4194 auto One = B.buildConstant(S32, 1); 4195 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4196 if (DstDivReg) 4197 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 4198 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 4199 4200 // Second quotient/remainder refinement. 4201 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4202 if (DstDivReg) 4203 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 4204 4205 if (DstRemReg) 4206 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 4207 } 4208 4209 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 4210 // 4211 // Return lo, hi of result 4212 // 4213 // %cvt.lo = G_UITOFP Val.lo 4214 // %cvt.hi = G_UITOFP Val.hi 4215 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 4216 // %rcp = G_AMDGPU_RCP_IFLAG %mad 4217 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 4218 // %mul2 = G_FMUL %mul1, 2**(-32) 4219 // %trunc = G_INTRINSIC_TRUNC %mul2 4220 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 4221 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 4222 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 4223 Register Val) { 4224 const LLT S32 = LLT::scalar(32); 4225 auto Unmerge = B.buildUnmerge(S32, Val); 4226 4227 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 4228 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 4229 4230 auto Mad = B.buildFMAD( 4231 S32, CvtHi, // 2**32 4232 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); 4233 4234 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 4235 auto Mul1 = B.buildFMul( 4236 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); 4237 4238 // 2**(-32) 4239 auto Mul2 = B.buildFMul( 4240 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); 4241 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 4242 4243 // -(2**32) 4244 auto Mad2 = B.buildFMAD( 4245 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), 4246 Mul1); 4247 4248 auto ResultLo = B.buildFPTOUI(S32, Mad2); 4249 auto ResultHi = B.buildFPTOUI(S32, Trunc); 4250 4251 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 4252 } 4253 4254 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 4255 Register DstDivReg, 4256 Register DstRemReg, 4257 Register Numer, 4258 Register Denom) const { 4259 const LLT S32 = LLT::scalar(32); 4260 const LLT S64 = LLT::scalar(64); 4261 const LLT S1 = LLT::scalar(1); 4262 Register RcpLo, RcpHi; 4263 4264 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 4265 4266 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); 4267 4268 auto Zero64 = B.buildConstant(S64, 0); 4269 auto NegDenom = B.buildSub(S64, Zero64, Denom); 4270 4271 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 4272 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 4273 4274 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 4275 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 4276 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 4277 4278 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 4279 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 4280 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); 4281 4282 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 4283 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 4284 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 4285 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 4286 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 4287 4288 auto Zero32 = B.buildConstant(S32, 0); 4289 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 4290 auto Add2_Hi = 
B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 4291 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); 4292 4293 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 4294 Register NumerLo = UnmergeNumer.getReg(0); 4295 Register NumerHi = UnmergeNumer.getReg(1); 4296 4297 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 4298 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 4299 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 4300 Register Mul3_Lo = UnmergeMul3.getReg(0); 4301 Register Mul3_Hi = UnmergeMul3.getReg(1); 4302 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 4303 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 4304 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 4305 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); 4306 4307 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 4308 Register DenomLo = UnmergeDenom.getReg(0); 4309 Register DenomHi = UnmergeDenom.getReg(1); 4310 4311 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 4312 auto C1 = B.buildSExt(S32, CmpHi); 4313 4314 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 4315 auto C2 = B.buildSExt(S32, CmpLo); 4316 4317 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 4318 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 4319 4320 // TODO: Here and below portions of the code can be enclosed into if/endif. 4321 // Currently control flow is unconditional and we have 4 selects after 4322 // potential endif to substitute PHIs. 4323 4324 // if C3 != 0 ... 4325 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 4326 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 4327 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 4328 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); 4329 4330 auto One64 = B.buildConstant(S64, 1); 4331 auto Add3 = B.buildAdd(S64, MulHi3, One64); 4332 4333 auto C4 = 4334 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 4335 auto C5 = 4336 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 4337 auto C6 = B.buildSelect( 4338 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 4339 4340 // if (C6 != 0) 4341 auto Add4 = B.buildAdd(S64, Add3, One64); 4342 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 4343 4344 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 4345 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 4346 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); 4347 4348 // endif C6 4349 // endif C3 4350 4351 if (DstDivReg) { 4352 auto Sel1 = B.buildSelect( 4353 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 4354 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4355 Sel1, MulHi3); 4356 } 4357 4358 if (DstRemReg) { 4359 auto Sel2 = B.buildSelect( 4360 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 4361 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4362 Sel2, Sub1); 4363 } 4364 } 4365 4366 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 4367 MachineRegisterInfo &MRI, 4368 MachineIRBuilder &B) const { 4369 Register DstDivReg, DstRemReg; 4370 switch (MI.getOpcode()) { 4371 default: 4372 llvm_unreachable("Unexpected opcode!"); 4373 case AMDGPU::G_UDIV: { 4374 DstDivReg = MI.getOperand(0).getReg(); 4375 break; 4376 } 4377 case AMDGPU::G_UREM: { 4378 DstRemReg = MI.getOperand(0).getReg(); 4379 break; 4380 } 4381 case 
AMDGPU::G_UDIVREM: { 4382 DstDivReg = MI.getOperand(0).getReg(); 4383 DstRemReg = MI.getOperand(1).getReg(); 4384 break; 4385 } 4386 } 4387 4388 const LLT S64 = LLT::scalar(64); 4389 const LLT S32 = LLT::scalar(32); 4390 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4391 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 4392 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4393 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4394 4395 if (Ty == S32) 4396 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 4397 else if (Ty == S64) 4398 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 4399 else 4400 return false; 4401 4402 MI.eraseFromParent(); 4403 return true; 4404 } 4405 4406 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 4407 MachineRegisterInfo &MRI, 4408 MachineIRBuilder &B) const { 4409 const LLT S64 = LLT::scalar(64); 4410 const LLT S32 = LLT::scalar(32); 4411 4412 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4413 if (Ty != S32 && Ty != S64) 4414 return false; 4415 4416 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4417 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 4418 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4419 4420 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 4421 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 4422 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 4423 4424 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 4425 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 4426 4427 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 4428 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 4429 4430 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 4431 switch (MI.getOpcode()) { 4432 default: 4433 llvm_unreachable("Unexpected opcode!"); 4434 case AMDGPU::G_SDIV: { 4435 DstDivReg = MI.getOperand(0).getReg(); 4436 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4437 break; 4438 } 4439 case AMDGPU::G_SREM: { 4440 DstRemReg = MI.getOperand(0).getReg(); 4441 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4442 break; 4443 } 4444 case AMDGPU::G_SDIVREM: { 4445 DstDivReg = MI.getOperand(0).getReg(); 4446 DstRemReg = MI.getOperand(1).getReg(); 4447 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4448 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4449 break; 4450 } 4451 } 4452 4453 if (Ty == S32) 4454 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4455 else 4456 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4457 4458 if (DstDivReg) { 4459 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 4460 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 4461 B.buildSub(DstDivReg, SignXor, Sign); 4462 } 4463 4464 if (DstRemReg) { 4465 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 4466 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 4467 B.buildSub(DstRemReg, SignXor, Sign); 4468 } 4469 4470 MI.eraseFromParent(); 4471 return true; 4472 } 4473 4474 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 4475 MachineRegisterInfo &MRI, 4476 MachineIRBuilder &B) const { 4477 Register Res = MI.getOperand(0).getReg(); 4478 Register LHS = MI.getOperand(1).getReg(); 4479 Register RHS = MI.getOperand(2).getReg(); 4480 uint16_t Flags = MI.getFlags(); 4481 LLT ResTy = MRI.getType(Res); 4482 4483 const MachineFunction &MF = B.getMF(); 4484 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || 4485 MF.getTarget().Options.UnsafeFPMath; 4486 4487 if (auto CLHS = getConstantFPVRegVal(LHS, 
MRI)) { 4488 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) 4489 return false; 4490 4491 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 4492 // the CI documentation has a worst case error of 1 ulp. 4493 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 4494 // use it as long as we aren't trying to use denormals. 4495 // 4496 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. 4497 4498 // 1 / x -> RCP(x) 4499 if (CLHS->isExactlyValue(1.0)) { 4500 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 4501 .addUse(RHS) 4502 .setMIFlags(Flags); 4503 4504 MI.eraseFromParent(); 4505 return true; 4506 } 4507 4508 // TODO: Match rsq 4509 4510 // -1 / x -> RCP( FNEG(x) ) 4511 if (CLHS->isExactlyValue(-1.0)) { 4512 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 4513 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 4514 .addUse(FNeg.getReg(0)) 4515 .setMIFlags(Flags); 4516 4517 MI.eraseFromParent(); 4518 return true; 4519 } 4520 } 4521 4522 // For f16 require arcp only. 4523 // For f32 require afn+arcp. 4524 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) || 4525 !MI.getFlag(MachineInstr::FmArcp))) 4526 return false; 4527 4528 // x / y -> x * (1.0 / y) 4529 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 4530 .addUse(RHS) 4531 .setMIFlags(Flags); 4532 B.buildFMul(Res, LHS, RCP, Flags); 4533 4534 MI.eraseFromParent(); 4535 return true; 4536 } 4537 4538 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 4539 MachineRegisterInfo &MRI, 4540 MachineIRBuilder &B) const { 4541 Register Res = MI.getOperand(0).getReg(); 4542 Register X = MI.getOperand(1).getReg(); 4543 Register Y = MI.getOperand(2).getReg(); 4544 uint16_t Flags = MI.getFlags(); 4545 LLT ResTy = MRI.getType(Res); 4546 4547 const MachineFunction &MF = B.getMF(); 4548 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 4549 MI.getFlag(MachineInstr::FmAfn); 4550 4551 if (!AllowInaccurateRcp) 4552 return false; 4553 4554 auto NegY = B.buildFNeg(ResTy, Y); 4555 auto One = B.buildFConstant(ResTy, 1.0); 4556 4557 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 4558 .addUse(Y) 4559 .setMIFlags(Flags); 4560 4561 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 4562 R = B.buildFMA(ResTy, Tmp0, R, R); 4563 4564 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 4565 R = B.buildFMA(ResTy, Tmp1, R, R); 4566 4567 auto Ret = B.buildFMul(ResTy, X, R); 4568 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 4569 4570 B.buildFMA(Res, Tmp2, R, Ret); 4571 MI.eraseFromParent(); 4572 return true; 4573 } 4574 4575 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 4576 MachineRegisterInfo &MRI, 4577 MachineIRBuilder &B) const { 4578 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4579 return true; 4580 4581 Register Res = MI.getOperand(0).getReg(); 4582 Register LHS = MI.getOperand(1).getReg(); 4583 Register RHS = MI.getOperand(2).getReg(); 4584 4585 uint16_t Flags = MI.getFlags(); 4586 4587 LLT S16 = LLT::scalar(16); 4588 LLT S32 = LLT::scalar(32); 4589 4590 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 4591 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 4592 4593 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4594 .addUse(RHSExt.getReg(0)) 4595 .setMIFlags(Flags); 4596 4597 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 4598 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 4599 4600 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 4601 .addUse(RDst.getReg(0)) 4602 .addUse(RHS) 4603 .addUse(LHS) 4604 .setMIFlags(Flags); 4605 
4606 MI.eraseFromParent(); 4607 return true; 4608 } 4609 4610 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 4611 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 4612 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, 4613 const GCNSubtarget &ST, 4614 SIModeRegisterDefaults Mode) { 4615 // Set SP denorm mode to this value. 4616 unsigned SPDenormMode = 4617 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 4618 4619 if (ST.hasDenormModeInst()) { 4620 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 4621 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 4622 4623 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 4624 B.buildInstr(AMDGPU::S_DENORM_MODE) 4625 .addImm(NewDenormModeValue); 4626 4627 } else { 4628 // Select FP32 bit field in mode register. 4629 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 4630 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 4631 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 4632 4633 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 4634 .addImm(SPDenormMode) 4635 .addImm(SPDenormModeBitField); 4636 } 4637 } 4638 4639 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 4640 MachineRegisterInfo &MRI, 4641 MachineIRBuilder &B) const { 4642 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4643 return true; 4644 4645 Register Res = MI.getOperand(0).getReg(); 4646 Register LHS = MI.getOperand(1).getReg(); 4647 Register RHS = MI.getOperand(2).getReg(); 4648 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4649 SIModeRegisterDefaults Mode = MFI->getMode(); 4650 4651 uint16_t Flags = MI.getFlags(); 4652 4653 LLT S32 = LLT::scalar(32); 4654 LLT S1 = LLT::scalar(1); 4655 4656 auto One = B.buildFConstant(S32, 1.0f); 4657 4658 auto DenominatorScaled = 4659 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 4660 .addUse(LHS) 4661 .addUse(RHS) 4662 .addImm(0) 4663 .setMIFlags(Flags); 4664 auto NumeratorScaled = 4665 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 4666 .addUse(LHS) 4667 .addUse(RHS) 4668 .addImm(1) 4669 .setMIFlags(Flags); 4670 4671 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4672 .addUse(DenominatorScaled.getReg(0)) 4673 .setMIFlags(Flags); 4674 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 4675 4676 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 4677 // aren't modeled as reading it. 4678 if (Mode.FP32Denormals != DenormalMode::getIEEE()) 4679 toggleSPDenormMode(true, B, ST, Mode); 4680 4681 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 4682 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 4683 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 4684 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 4685 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 4686 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 4687 4688 // FIXME: This mishandles dynamic denormal mode. We need to query the 4689 // current mode and restore the original. 
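  // Denormal values only matter for the scaled intermediates produced by
  // div_scale and consumed by the FMA chain above, so the flushing mode can
  // be restored before div_fmas/div_fixup.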
4690 if (Mode.FP32Denormals != DenormalMode::getIEEE()) 4691 toggleSPDenormMode(false, B, ST, Mode); 4692 4693 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 4694 .addUse(Fma4.getReg(0)) 4695 .addUse(Fma1.getReg(0)) 4696 .addUse(Fma3.getReg(0)) 4697 .addUse(NumeratorScaled.getReg(1)) 4698 .setMIFlags(Flags); 4699 4700 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 4701 .addUse(Fmas.getReg(0)) 4702 .addUse(RHS) 4703 .addUse(LHS) 4704 .setMIFlags(Flags); 4705 4706 MI.eraseFromParent(); 4707 return true; 4708 } 4709 4710 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 4711 MachineRegisterInfo &MRI, 4712 MachineIRBuilder &B) const { 4713 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 4714 return true; 4715 4716 Register Res = MI.getOperand(0).getReg(); 4717 Register LHS = MI.getOperand(1).getReg(); 4718 Register RHS = MI.getOperand(2).getReg(); 4719 4720 uint16_t Flags = MI.getFlags(); 4721 4722 LLT S64 = LLT::scalar(64); 4723 LLT S1 = LLT::scalar(1); 4724 4725 auto One = B.buildFConstant(S64, 1.0); 4726 4727 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 4728 .addUse(LHS) 4729 .addUse(RHS) 4730 .addImm(0) 4731 .setMIFlags(Flags); 4732 4733 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 4734 4735 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 4736 .addUse(DivScale0.getReg(0)) 4737 .setMIFlags(Flags); 4738 4739 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 4740 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 4741 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 4742 4743 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 4744 .addUse(LHS) 4745 .addUse(RHS) 4746 .addImm(1) 4747 .setMIFlags(Flags); 4748 4749 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 4750 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 4751 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 4752 4753 Register Scale; 4754 if (!ST.hasUsableDivScaleConditionOutput()) { 4755 // Workaround a hardware bug on SI where the condition output from div_scale 4756 // is not usable. 
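    // Reconstruct the scale bit div_fmas expects by comparing the high
    // halves of the two div_scale results with the original numerator and
    // denominator, and combining the compares with xor.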
4757 4758 LLT S32 = LLT::scalar(32); 4759 4760 auto NumUnmerge = B.buildUnmerge(S32, LHS); 4761 auto DenUnmerge = B.buildUnmerge(S32, RHS); 4762 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 4763 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 4764 4765 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 4766 Scale1Unmerge.getReg(1)); 4767 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 4768 Scale0Unmerge.getReg(1)); 4769 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 4770 } else { 4771 Scale = DivScale1.getReg(1); 4772 } 4773 4774 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 4775 .addUse(Fma4.getReg(0)) 4776 .addUse(Fma3.getReg(0)) 4777 .addUse(Mul.getReg(0)) 4778 .addUse(Scale) 4779 .setMIFlags(Flags); 4780 4781 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false) 4782 .addUse(Fmas.getReg(0)) 4783 .addUse(RHS) 4784 .addUse(LHS) 4785 .setMIFlags(Flags); 4786 4787 MI.eraseFromParent(); 4788 return true; 4789 } 4790 4791 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI, 4792 MachineRegisterInfo &MRI, 4793 MachineIRBuilder &B) const { 4794 Register Res0 = MI.getOperand(0).getReg(); 4795 Register Res1 = MI.getOperand(1).getReg(); 4796 Register Val = MI.getOperand(2).getReg(); 4797 uint16_t Flags = MI.getFlags(); 4798 4799 LLT Ty = MRI.getType(Res0); 4800 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32); 4801 4802 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}, false) 4803 .addUse(Val) 4804 .setMIFlags(Flags); 4805 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}, false) 4806 .addUse(Val) 4807 .setMIFlags(Flags); 4808 4809 if (ST.hasFractBug()) { 4810 auto Fabs = B.buildFAbs(Ty, Val); 4811 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty))); 4812 auto IsFinite = 4813 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 4814 auto Zero = B.buildConstant(InstrExpTy, 0); 4815 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero); 4816 Mant = B.buildSelect(Ty, IsFinite, Mant, Val); 4817 } 4818 4819 B.buildCopy(Res0, Mant); 4820 B.buildSExtOrTrunc(Res1, Exp); 4821 4822 MI.eraseFromParent(); 4823 return true; 4824 } 4825 4826 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 4827 MachineRegisterInfo &MRI, 4828 MachineIRBuilder &B) const { 4829 Register Res = MI.getOperand(0).getReg(); 4830 Register LHS = MI.getOperand(2).getReg(); 4831 Register RHS = MI.getOperand(3).getReg(); 4832 uint16_t Flags = MI.getFlags(); 4833 4834 LLT S32 = LLT::scalar(32); 4835 LLT S1 = LLT::scalar(1); 4836 4837 auto Abs = B.buildFAbs(S32, RHS, Flags); 4838 const APFloat C0Val(1.0f); 4839 4840 auto C0 = B.buildFConstant(S32, 0x1p+96f); 4841 auto C1 = B.buildFConstant(S32, 0x1p-32f); 4842 auto C2 = B.buildFConstant(S32, 1.0f); 4843 4844 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 4845 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 4846 4847 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 4848 4849 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4850 .addUse(Mul0.getReg(0)) 4851 .setMIFlags(Flags); 4852 4853 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 4854 4855 B.buildFMul(Res, Sel, Mul1, Flags); 4856 4857 MI.eraseFromParent(); 4858 return true; 4859 } 4860 4861 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, 4862 MachineRegisterInfo &MRI, 4863 MachineIRBuilder &B) const { 4864 // For double type, the SQRT and RSQ instructions don't have required 
4865 // precision, we apply Goldschmidt's algorithm to improve the result: 4866 // 4867 // y0 = rsq(x) 4868 // g0 = x * y0 4869 // h0 = 0.5 * y0 4870 // 4871 // r0 = 0.5 - h0 * g0 4872 // g1 = g0 * r0 + g0 4873 // h1 = h0 * r0 + h0 4874 // 4875 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 4876 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 4877 // h2 = h1 * r1 + h1 4878 // 4879 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 4880 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 4881 // 4882 // sqrt(x) = g3 4883 4884 const LLT S1 = LLT::scalar(1); 4885 const LLT S32 = LLT::scalar(32); 4886 const LLT F64 = LLT::scalar(64); 4887 4888 Register Dst = MI.getOperand(0).getReg(); 4889 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt"); 4890 4891 Register X = MI.getOperand(1).getReg(); 4892 unsigned Flags = MI.getFlags(); 4893 4894 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); 4895 4896 auto ZeroInt = B.buildConstant(S32, 0); 4897 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); 4898 4899 // Scale up input if it is too small. 4900 auto ScaleUpFactor = B.buildConstant(S32, 256); 4901 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); 4902 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); 4903 4904 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false) 4905 .addReg(SqrtX.getReg(0)); 4906 4907 auto Half = B.buildFConstant(F64, 0.5); 4908 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); 4909 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); 4910 4911 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); 4912 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); 4913 4914 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); 4915 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); 4916 4917 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); 4918 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); 4919 4920 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); 4921 4922 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); 4923 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); 4924 4925 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); 4926 4927 // Scale down the result. 4928 auto ScaleDownFactor = B.buildConstant(S32, -128); 4929 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); 4930 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); 4931 4932 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check 4933 // with finite only or nsz because rsq(+/-0) = +/-inf 4934 4935 // TODO: Check for DAZ and expand to subnormals 4936 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 4937 4938 // If x is +INF, +0, or -0, use its original value 4939 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags); 4940 4941 MI.eraseFromParent(); 4942 return true; 4943 } 4944 4945 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 4946 // FIXME: Why do we handle this one but not other removed instructions? 4947 // 4948 // Reciprocal square root. The clamp prevents infinite results, clamping 4949 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 4950 // +-max_float. 
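// The expansion below is
//   fmax(fmin(rsq(x), +max_float), -max_float)
// using the IEEE or non-IEEE min/max forms depending on the function's FP
// mode.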
4951 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 4952 MachineRegisterInfo &MRI, 4953 MachineIRBuilder &B) const { 4954 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 4955 return true; 4956 4957 Register Dst = MI.getOperand(0).getReg(); 4958 Register Src = MI.getOperand(2).getReg(); 4959 auto Flags = MI.getFlags(); 4960 4961 LLT Ty = MRI.getType(Dst); 4962 4963 const fltSemantics *FltSemantics; 4964 if (Ty == LLT::scalar(32)) 4965 FltSemantics = &APFloat::IEEEsingle(); 4966 else if (Ty == LLT::scalar(64)) 4967 FltSemantics = &APFloat::IEEEdouble(); 4968 else 4969 return false; 4970 4971 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 4972 .addUse(Src) 4973 .setMIFlags(Flags); 4974 4975 // We don't need to concern ourselves with the snan handling difference, since 4976 // the rsq quieted (or not) so use the one which will directly select. 4977 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4978 const bool UseIEEE = MFI->getMode().IEEE; 4979 4980 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 4981 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 4982 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 4983 4984 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 4985 4986 if (UseIEEE) 4987 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 4988 else 4989 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 4990 MI.eraseFromParent(); 4991 return true; 4992 } 4993 4994 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 4995 switch (IID) { 4996 case Intrinsic::amdgcn_ds_fadd: 4997 return AMDGPU::G_ATOMICRMW_FADD; 4998 case Intrinsic::amdgcn_ds_fmin: 4999 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 5000 case Intrinsic::amdgcn_ds_fmax: 5001 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 5002 default: 5003 llvm_unreachable("not a DS FP intrinsic"); 5004 } 5005 } 5006 5007 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 5008 MachineInstr &MI, 5009 Intrinsic::ID IID) const { 5010 GISelChangeObserver &Observer = Helper.Observer; 5011 Observer.changingInstr(MI); 5012 5013 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 5014 5015 // The remaining operands were used to set fields in the MemOperand on 5016 // construction. 5017 for (int I = 6; I > 3; --I) 5018 MI.removeOperand(I); 5019 5020 MI.removeOperand(1); // Remove the intrinsic ID. 5021 Observer.changedInstr(MI); 5022 return true; 5023 } 5024 5025 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 5026 MachineRegisterInfo &MRI, 5027 MachineIRBuilder &B) const { 5028 uint64_t Offset = 5029 ST.getTargetLowering()->getImplicitParameterOffset( 5030 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 5031 LLT DstTy = MRI.getType(DstReg); 5032 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 5033 5034 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 5035 if (!loadInputValue(KernargPtrReg, B, 5036 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 5037 return false; 5038 5039 // FIXME: This should be nuw 5040 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 5041 return true; 5042 } 5043 5044 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32 5045 /// bits of the pointer and replace them with the stride argument, then 5046 /// merge_values everything together. In the common case of a raw buffer (the 5047 /// stride component is 0), we can just AND off the upper half. 
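//
// Illustrative layout of the four dwords merged below (exposition only; the
// field positions are simply what the masking and shifting in this function
// produce):
//
//   word0 = ptr[31:0]
//   word1 = (ptr[63:32] & 0xffff) | (stride << 16)
//   word2 = num_records
//   word3 = flags
//
// or, as a scalar C sketch for word1, assuming a 64-bit pointer value:
//
//   uint32_t word1 = (uint32_t(Ptr >> 32) & 0xffffu) | (uint32_t(Stride) << 16);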
5048 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin( 5049 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5050 Register Result = MI.getOperand(0).getReg(); 5051 Register Pointer = MI.getOperand(2).getReg(); 5052 Register Stride = MI.getOperand(3).getReg(); 5053 Register NumRecords = MI.getOperand(4).getReg(); 5054 Register Flags = MI.getOperand(5).getReg(); 5055 5056 LLT S32 = LLT::scalar(32); 5057 5058 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5059 auto Unmerge = B.buildUnmerge(S32, Pointer); 5060 Register LowHalf = Unmerge.getReg(0); 5061 Register HighHalf = Unmerge.getReg(1); 5062 5063 auto AndMask = B.buildConstant(S32, 0x0000ffff); 5064 auto Masked = B.buildAnd(S32, HighHalf, AndMask); 5065 5066 MachineInstrBuilder NewHighHalf = Masked; 5067 std::optional<ValueAndVReg> StrideConst = 5068 getIConstantVRegValWithLookThrough(Stride, MRI); 5069 if (!StrideConst || !StrideConst->Value.isZero()) { 5070 MachineInstrBuilder ShiftedStride; 5071 if (StrideConst) { 5072 uint32_t StrideVal = StrideConst->Value.getZExtValue(); 5073 uint32_t ShiftedStrideVal = StrideVal << 16; 5074 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal); 5075 } else { 5076 auto ExtStride = B.buildAnyExt(S32, Stride); 5077 auto ShiftConst = B.buildConstant(S32, 16); 5078 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst); 5079 } 5080 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride); 5081 } 5082 Register NewHighHalfReg = NewHighHalf.getReg(0); 5083 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags}); 5084 MI.eraseFromParent(); 5085 return true; 5086 } 5087 5088 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 5089 MachineRegisterInfo &MRI, 5090 MachineIRBuilder &B) const { 5091 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5092 if (!MFI->isEntryFunction()) { 5093 return legalizePreloadedArgIntrin(MI, MRI, B, 5094 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 5095 } 5096 5097 Register DstReg = MI.getOperand(0).getReg(); 5098 if (!getImplicitArgPtr(DstReg, MRI, B)) 5099 return false; 5100 5101 MI.eraseFromParent(); 5102 return true; 5103 } 5104 5105 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, 5106 MachineRegisterInfo &MRI, 5107 MachineIRBuilder &B) const { 5108 Function &F = B.getMF().getFunction(); 5109 std::optional<uint32_t> KnownSize = 5110 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 5111 if (KnownSize.has_value()) 5112 B.buildConstant(DstReg, *KnownSize); 5113 return false; 5114 } 5115 5116 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, 5117 MachineRegisterInfo &MRI, 5118 MachineIRBuilder &B) const { 5119 5120 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5121 if (!MFI->isEntryFunction()) { 5122 return legalizePreloadedArgIntrin(MI, MRI, B, 5123 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 5124 } 5125 5126 Register DstReg = MI.getOperand(0).getReg(); 5127 if (!getLDSKernelId(DstReg, MRI, B)) 5128 return false; 5129 5130 MI.eraseFromParent(); 5131 return true; 5132 } 5133 5134 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 5135 MachineRegisterInfo &MRI, 5136 MachineIRBuilder &B, 5137 unsigned AddrSpace) const { 5138 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 5139 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 5140 Register Hi32 = Unmerge.getReg(1); 5141 5142 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 5143 MI.eraseFromParent(); 5144 return true; 5145 } 5146 
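// For reference, the is.shared/is.private expansion above reduces to comparing
// the upper dword of the flat pointer with the segment's aperture (illustrative
// sketch only; ApertureHi stands for the value produced by getSegmentAperture
// and is not an identifier in this file):
//
//   bool isInSegment(uint64_t FlatPtr, uint32_t ApertureHi) {
//     return uint32_t(FlatPtr >> 32) == ApertureHi;
//   }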
5147 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 5148 // offset (the offset that is included in bounds checking and swizzling, to be 5149 // split between the instruction's voffset and immoffset fields) and soffset 5150 // (the offset that is excluded from bounds checking and swizzling, to go in 5151 // the instruction's soffset field). This function takes the first kind of 5152 // offset and figures out how to split it between voffset and immoffset. 5153 std::pair<Register, unsigned> 5154 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 5155 Register OrigOffset) const { 5156 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); 5157 Register BaseReg; 5158 unsigned ImmOffset; 5159 const LLT S32 = LLT::scalar(32); 5160 MachineRegisterInfo &MRI = *B.getMRI(); 5161 5162 std::tie(BaseReg, ImmOffset) = 5163 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 5164 5165 // If BaseReg is a pointer, convert it to int. 5166 if (MRI.getType(BaseReg).isPointer()) 5167 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 5168 5169 // If the immediate value is too big for the immoffset field, put only bits 5170 // that would normally fit in the immoffset field. The remaining value that 5171 // is copied/added for the voffset field is a large power of 2, and it 5172 // stands more chance of being CSEd with the copy/add for another similar 5173 // load/store. 5174 // However, do not do that rounding down if that is a negative 5175 // number, as it appears to be illegal to have a negative offset in the 5176 // vgpr, even if adding the immediate offset makes it positive. 5177 unsigned Overflow = ImmOffset & ~MaxImm; 5178 ImmOffset -= Overflow; 5179 if ((int32_t)Overflow < 0) { 5180 Overflow += ImmOffset; 5181 ImmOffset = 0; 5182 } 5183 5184 if (Overflow != 0) { 5185 if (!BaseReg) { 5186 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 5187 } else { 5188 auto OverflowVal = B.buildConstant(S32, Overflow); 5189 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 5190 } 5191 } 5192 5193 if (!BaseReg) 5194 BaseReg = B.buildConstant(S32, 0).getReg(0); 5195 5196 return std::pair(BaseReg, ImmOffset); 5197 } 5198 5199 /// Handle register layout difference for f16 images for some subtargets. 
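///
/// Illustrative example (exposition only): on subtargets with unpacked d16
/// VMEM, a <4 x s16> store payload is widened so each element occupies the
/// low half of its own dword:
///
///   <4 x s16> [a, b, c, d]  -->  <4 x s32> [anyext a, anyext b, anyext c, anyext d]
///
/// Packed subtargets keep two 16-bit elements per dword, so in the common case
/// only <3 x s16> needs padding out to <4 x s16> (the image-store d16 bug path
/// below has its own repacking).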
5200 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 5201 MachineRegisterInfo &MRI, 5202 Register Reg, 5203 bool ImageStore) const { 5204 const LLT S16 = LLT::scalar(16); 5205 const LLT S32 = LLT::scalar(32); 5206 LLT StoreVT = MRI.getType(Reg); 5207 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 5208 5209 if (ST.hasUnpackedD16VMem()) { 5210 auto Unmerge = B.buildUnmerge(S16, Reg); 5211 5212 SmallVector<Register, 4> WideRegs; 5213 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5214 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 5215 5216 int NumElts = StoreVT.getNumElements(); 5217 5218 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 5219 .getReg(0); 5220 } 5221 5222 if (ImageStore && ST.hasImageStoreD16Bug()) { 5223 if (StoreVT.getNumElements() == 2) { 5224 SmallVector<Register, 4> PackedRegs; 5225 Reg = B.buildBitcast(S32, Reg).getReg(0); 5226 PackedRegs.push_back(Reg); 5227 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 5228 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 5229 .getReg(0); 5230 } 5231 5232 if (StoreVT.getNumElements() == 3) { 5233 SmallVector<Register, 4> PackedRegs; 5234 auto Unmerge = B.buildUnmerge(S16, Reg); 5235 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5236 PackedRegs.push_back(Unmerge.getReg(I)); 5237 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 5238 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 5239 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 5240 } 5241 5242 if (StoreVT.getNumElements() == 4) { 5243 SmallVector<Register, 4> PackedRegs; 5244 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 5245 auto Unmerge = B.buildUnmerge(S32, Reg); 5246 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5247 PackedRegs.push_back(Unmerge.getReg(I)); 5248 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 5249 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 5250 .getReg(0); 5251 } 5252 5253 llvm_unreachable("invalid data type"); 5254 } 5255 5256 if (StoreVT == LLT::fixed_vector(3, S16)) { 5257 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 5258 .getReg(0); 5259 } 5260 return Reg; 5261 } 5262 5263 Register AMDGPULegalizerInfo::fixStoreSourceType( 5264 MachineIRBuilder &B, Register VData, bool IsFormat) const { 5265 MachineRegisterInfo *MRI = B.getMRI(); 5266 LLT Ty = MRI->getType(VData); 5267 5268 const LLT S16 = LLT::scalar(16); 5269 5270 // Fixup buffer resources themselves needing to be v4i128. 5271 if (hasBufferRsrcWorkaround(Ty)) 5272 return castBufferRsrcToV4I32(VData, B); 5273 5274 // Fixup illegal register types for i8 stores. 
5275 if (Ty == LLT::scalar(8) || Ty == S16) { 5276 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 5277 return AnyExt; 5278 } 5279 5280 if (Ty.isVector()) { 5281 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 5282 if (IsFormat) 5283 return handleD16VData(B, *MRI, VData); 5284 } 5285 } 5286 5287 return VData; 5288 } 5289 5290 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 5291 MachineRegisterInfo &MRI, 5292 MachineIRBuilder &B, 5293 bool IsTyped, 5294 bool IsFormat) const { 5295 Register VData = MI.getOperand(1).getReg(); 5296 LLT Ty = MRI.getType(VData); 5297 LLT EltTy = Ty.getScalarType(); 5298 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5299 const LLT S32 = LLT::scalar(32); 5300 5301 VData = fixStoreSourceType(B, VData, IsFormat); 5302 castBufferRsrcArgToV4I32(MI, B, 2); 5303 Register RSrc = MI.getOperand(2).getReg(); 5304 5305 MachineMemOperand *MMO = *MI.memoperands_begin(); 5306 const int MemSize = MMO->getSize(); 5307 5308 unsigned ImmOffset; 5309 5310 // The typed intrinsics add an immediate after the registers. 5311 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5312 5313 // The struct intrinsic variants add one additional operand over raw. 5314 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 5315 Register VIndex; 5316 int OpOffset = 0; 5317 if (HasVIndex) { 5318 VIndex = MI.getOperand(3).getReg(); 5319 OpOffset = 1; 5320 } else { 5321 VIndex = B.buildConstant(S32, 0).getReg(0); 5322 } 5323 5324 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5325 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5326 5327 unsigned Format = 0; 5328 if (IsTyped) { 5329 Format = MI.getOperand(5 + OpOffset).getImm(); 5330 ++OpOffset; 5331 } 5332 5333 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5334 5335 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5336 5337 unsigned Opc; 5338 if (IsTyped) { 5339 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 5340 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 5341 } else if (IsFormat) { 5342 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 5343 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 5344 } else { 5345 switch (MemSize) { 5346 case 1: 5347 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 5348 break; 5349 case 2: 5350 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 5351 break; 5352 default: 5353 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 5354 break; 5355 } 5356 } 5357 5358 auto MIB = B.buildInstr(Opc) 5359 .addUse(VData) // vdata 5360 .addUse(RSrc) // rsrc 5361 .addUse(VIndex) // vindex 5362 .addUse(VOffset) // voffset 5363 .addUse(SOffset) // soffset 5364 .addImm(ImmOffset); // offset(imm) 5365 5366 if (IsTyped) 5367 MIB.addImm(Format); 5368 5369 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5370 .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) 5371 .addMemOperand(MMO); 5372 5373 MI.eraseFromParent(); 5374 return true; 5375 } 5376 5377 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, 5378 Register VIndex, Register VOffset, Register SOffset, 5379 unsigned ImmOffset, unsigned Format, 5380 unsigned AuxiliaryData, MachineMemOperand *MMO, 5381 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { 5382 auto MIB = B.buildInstr(Opc) 5383 .addDef(LoadDstReg) // vdata 5384 .addUse(RSrc) // rsrc 5385 .addUse(VIndex) // vindex 5386 .addUse(VOffset) // voffset 5387 .addUse(SOffset) // soffset 5388 .addImm(ImmOffset); // offset(imm) 5389 5390 if (IsTyped) 5391 MIB.addImm(Format); 5392 5393 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5394 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 5395 .addMemOperand(MMO); 5396 } 5397 5398 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 5399 MachineRegisterInfo &MRI, 5400 MachineIRBuilder &B, 5401 bool IsFormat, 5402 bool IsTyped) const { 5403 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 5404 MachineMemOperand *MMO = *MI.memoperands_begin(); 5405 const LLT MemTy = MMO->getMemoryType(); 5406 const LLT S32 = LLT::scalar(32); 5407 5408 Register Dst = MI.getOperand(0).getReg(); 5409 5410 Register StatusDst; 5411 int OpOffset = 0; 5412 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); 5413 bool IsTFE = MI.getNumExplicitDefs() == 2; 5414 if (IsTFE) { 5415 StatusDst = MI.getOperand(1).getReg(); 5416 ++OpOffset; 5417 } 5418 5419 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset); 5420 Register RSrc = MI.getOperand(2 + OpOffset).getReg(); 5421 5422 // The typed intrinsics add an immediate after the registers. 5423 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5424 5425 // The struct intrinsic variants add one additional operand over raw. 5426 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; 5427 Register VIndex; 5428 if (HasVIndex) { 5429 VIndex = MI.getOperand(3 + OpOffset).getReg(); 5430 ++OpOffset; 5431 } else { 5432 VIndex = B.buildConstant(S32, 0).getReg(0); 5433 } 5434 5435 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5436 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5437 5438 unsigned Format = 0; 5439 if (IsTyped) { 5440 Format = MI.getOperand(5 + OpOffset).getImm(); 5441 ++OpOffset; 5442 } 5443 5444 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5445 unsigned ImmOffset; 5446 5447 LLT Ty = MRI.getType(Dst); 5448 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the 5449 // logic doesn't have to handle that case. 5450 if (hasBufferRsrcWorkaround(Ty)) { 5451 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); 5452 Dst = MI.getOperand(0).getReg(); 5453 } 5454 LLT EltTy = Ty.getScalarType(); 5455 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5456 const bool Unpacked = ST.hasUnpackedD16VMem(); 5457 5458 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5459 5460 unsigned Opc; 5461 5462 // TODO: Support TFE for typed and narrow loads. 5463 if (IsTyped) { 5464 if (IsTFE) 5465 return false; 5466 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 5467 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 5468 } else if (IsFormat) { 5469 if (IsD16) { 5470 if (IsTFE) 5471 return false; 5472 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; 5473 } else { 5474 Opc = IsTFE ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE 5475 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 5476 } 5477 } else { 5478 if (IsTFE) 5479 return false; 5480 switch (MemTy.getSizeInBits()) { 5481 case 8: 5482 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 5483 break; 5484 case 16: 5485 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 5486 break; 5487 default: 5488 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 5489 break; 5490 } 5491 } 5492 5493 if (IsTFE) { 5494 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); 5495 unsigned NumLoadDWords = NumValueDWords + 1; 5496 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); 5497 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); 5498 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5499 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5500 if (NumValueDWords == 1) { 5501 B.buildUnmerge({Dst, StatusDst}, LoadDstReg); 5502 } else { 5503 SmallVector<Register, 5> LoadElts; 5504 for (unsigned I = 0; I != NumValueDWords; ++I) 5505 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); 5506 LoadElts.push_back(StatusDst); 5507 B.buildUnmerge(LoadElts, LoadDstReg); 5508 LoadElts.truncate(NumValueDWords); 5509 B.buildMergeLikeInstr(Dst, LoadElts); 5510 } 5511 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || 5512 (IsD16 && !Ty.isVector())) { 5513 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 5514 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5515 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5516 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5517 B.buildTrunc(Dst, LoadDstReg); 5518 } else if (Unpacked && IsD16 && Ty.isVector()) { 5519 LLT UnpackedTy = Ty.changeElementSize(32); 5520 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 5521 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5522 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5523 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5524 // FIXME: G_TRUNC should work, but legalization currently fails 5525 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 5526 SmallVector<Register, 4> Repack; 5527 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 5528 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 5529 B.buildMergeLikeInstr(Dst, Repack); 5530 } else { 5531 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, 5532 AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5533 } 5534 5535 MI.eraseFromParent(); 5536 return true; 5537 } 5538 5539 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 5540 switch (IntrID) { 5541 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 5542 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 5543 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 5544 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 5545 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 5546 case Intrinsic::amdgcn_raw_buffer_atomic_add: 5547 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 5548 case Intrinsic::amdgcn_struct_buffer_atomic_add: 5549 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 5550 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 5551 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 5552 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 5553 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 5554 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 5555 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 5556 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 5557 case 
Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 5558 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 5559 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 5560 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 5561 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 5562 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 5563 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 5564 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 5565 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 5566 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 5567 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 5568 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 5569 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 5570 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 5571 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 5572 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 5573 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 5574 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 5575 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 5576 case Intrinsic::amdgcn_raw_buffer_atomic_and: 5577 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 5578 case Intrinsic::amdgcn_struct_buffer_atomic_and: 5579 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 5580 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 5581 case Intrinsic::amdgcn_raw_buffer_atomic_or: 5582 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 5583 case Intrinsic::amdgcn_struct_buffer_atomic_or: 5584 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 5585 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 5586 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 5587 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 5588 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 5589 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 5590 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 5591 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 5592 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 5593 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 5594 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 5595 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 5596 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 5597 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 5598 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 5599 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 5600 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 5601 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 5602 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 5603 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 5604 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 5605 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 5606 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 5607 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 5608 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 5609 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 5610 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 5611 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 5612 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 5613 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 5614 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 5615 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 5616 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 5617 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 5618 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 5619 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 5620 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 5621 default: 5622 llvm_unreachable("unhandled atomic opcode"); 5623 } 5624 } 5625 5626 bool 
AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 5627 MachineIRBuilder &B, 5628 Intrinsic::ID IID) const { 5629 const bool IsCmpSwap = 5630 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 5631 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap || 5632 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap || 5633 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap; 5634 const bool HasReturn = MI.getNumExplicitDefs() != 0; 5635 5636 Register Dst; 5637 5638 int OpOffset = 0; 5639 if (HasReturn) { 5640 // A few FP atomics do not support return values. 5641 Dst = MI.getOperand(0).getReg(); 5642 } else { 5643 OpOffset = -1; 5644 } 5645 5646 // Since we don't have 128-bit atomics, we don't need to handle the case of 5647 // p8 argmunents to the atomic itself 5648 Register VData = MI.getOperand(2 + OpOffset).getReg(); 5649 Register CmpVal; 5650 5651 if (IsCmpSwap) { 5652 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 5653 ++OpOffset; 5654 } 5655 5656 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset); 5657 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 5658 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 5659 5660 // The struct intrinsic variants add one additional operand over raw. 5661 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 5662 Register VIndex; 5663 if (HasVIndex) { 5664 VIndex = MI.getOperand(4 + OpOffset).getReg(); 5665 ++OpOffset; 5666 } else { 5667 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 5668 } 5669 5670 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 5671 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 5672 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 5673 5674 MachineMemOperand *MMO = *MI.memoperands_begin(); 5675 5676 unsigned ImmOffset; 5677 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5678 5679 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 5680 5681 if (HasReturn) 5682 MIB.addDef(Dst); 5683 5684 MIB.addUse(VData); // vdata 5685 5686 if (IsCmpSwap) 5687 MIB.addReg(CmpVal); 5688 5689 MIB.addUse(RSrc) // rsrc 5690 .addUse(VIndex) // vindex 5691 .addUse(VOffset) // voffset 5692 .addUse(SOffset) // soffset 5693 .addImm(ImmOffset) // offset(imm) 5694 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5695 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 5696 .addMemOperand(MMO); 5697 5698 MI.eraseFromParent(); 5699 return true; 5700 } 5701 5702 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 5703 /// vector with s16 typed elements. 5704 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 5705 SmallVectorImpl<Register> &PackedAddrs, 5706 unsigned ArgOffset, 5707 const AMDGPU::ImageDimIntrinsicInfo *Intr, 5708 bool IsA16, bool IsG16) { 5709 const LLT S16 = LLT::scalar(16); 5710 const LLT V2S16 = LLT::fixed_vector(2, 16); 5711 auto EndIdx = Intr->VAddrEnd; 5712 5713 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 5714 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 5715 if (!SrcOp.isReg()) 5716 continue; // _L to _LZ may have eliminated this. 5717 5718 Register AddrReg = SrcOp.getReg(); 5719 5720 if ((I < Intr->GradientStart) || 5721 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 5722 (I >= Intr->CoordStart && !IsA16)) { 5723 if ((I < Intr->GradientStart) && IsA16 && 5724 (B.getMRI()->getType(AddrReg) == S16)) { 5725 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 5726 // Special handling of bias when A16 is on. 
Bias is of type half but 5727 // occupies full 32-bit. 5728 PackedAddrs.push_back( 5729 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 5730 .getReg(0)); 5731 } else { 5732 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 5733 "Bias needs to be converted to 16 bit in A16 mode"); 5734 // Handle any gradient or coordinate operands that should not be packed 5735 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 5736 PackedAddrs.push_back(AddrReg); 5737 } 5738 } else { 5739 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 5740 // derivatives dx/dh and dx/dv are packed with undef. 5741 if (((I + 1) >= EndIdx) || 5742 ((Intr->NumGradients / 2) % 2 == 1 && 5743 (I == static_cast<unsigned>(Intr->GradientStart + 5744 (Intr->NumGradients / 2) - 1) || 5745 I == static_cast<unsigned>(Intr->GradientStart + 5746 Intr->NumGradients - 1))) || 5747 // Check for _L to _LZ optimization 5748 !MI.getOperand(ArgOffset + I + 1).isReg()) { 5749 PackedAddrs.push_back( 5750 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 5751 .getReg(0)); 5752 } else { 5753 PackedAddrs.push_back( 5754 B.buildBuildVector( 5755 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 5756 .getReg(0)); 5757 ++I; 5758 } 5759 } 5760 } 5761 } 5762 5763 /// Convert from separate vaddr components to a single vector address register, 5764 /// and replace the remaining operands with $noreg. 5765 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 5766 int DimIdx, int NumVAddrs) { 5767 const LLT S32 = LLT::scalar(32); 5768 (void)S32; 5769 SmallVector<Register, 8> AddrRegs; 5770 for (int I = 0; I != NumVAddrs; ++I) { 5771 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 5772 if (SrcOp.isReg()) { 5773 AddrRegs.push_back(SrcOp.getReg()); 5774 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 5775 } 5776 } 5777 5778 int NumAddrRegs = AddrRegs.size(); 5779 if (NumAddrRegs != 1) { 5780 auto VAddr = 5781 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 5782 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 5783 } 5784 5785 for (int I = 1; I != NumVAddrs; ++I) { 5786 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 5787 if (SrcOp.isReg()) 5788 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 5789 } 5790 } 5791 5792 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 5793 /// 5794 /// Depending on the subtarget, load/store with 16-bit element data need to be 5795 /// rewritten to use the low half of 32-bit registers, or directly use a packed 5796 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 5797 /// registers. 5798 /// 5799 /// We don't want to directly select image instructions just yet, but also want 5800 /// to exposes all register repacking to the legalizer/combiners. We also don't 5801 /// want a selected instruction entering RegBankSelect. In order to avoid 5802 /// defining a multitude of intermediate image instructions, directly hack on 5803 /// the intrinsic's arguments. In cases like a16 addresses, this requires 5804 /// padding now unnecessary arguments with $noreg. 
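///
/// Illustrative example of the a16 rewrite (schematic, not actual MIR syntax):
/// a 2D sample with 16-bit coordinates
///
///   ... intrinsic(@llvm.amdgcn.image.sample.2d), $dmask, %u:s16, %v:s16, ...
///
/// has its address operands repacked into one dword and the now-unused slot
/// padded with $noreg:
///
///   %uv:_(<2 x s16>) = G_BUILD_VECTOR %u, %v
///   ... intrinsic(@llvm.amdgcn.image.sample.2d), $dmask, %uv, $noreg, ...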
5805 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 5806 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 5807 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 5808 5809 const MachineFunction &MF = *MI.getMF(); 5810 const unsigned NumDefs = MI.getNumExplicitDefs(); 5811 const unsigned ArgOffset = NumDefs + 1; 5812 bool IsTFE = NumDefs == 2; 5813 // We are only processing the operands of d16 image operations on subtargets 5814 // that use the unpacked register layout, or need to repack the TFE result. 5815 5816 // TODO: Do we need to guard against already legalized intrinsics? 5817 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 5818 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 5819 5820 MachineRegisterInfo *MRI = B.getMRI(); 5821 const LLT S32 = LLT::scalar(32); 5822 const LLT S16 = LLT::scalar(16); 5823 const LLT V2S16 = LLT::fixed_vector(2, 16); 5824 5825 unsigned DMask = 0; 5826 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 5827 LLT Ty = MRI->getType(VData); 5828 5829 // Check for 16 bit addresses and pack if true. 5830 LLT GradTy = 5831 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 5832 LLT AddrTy = 5833 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 5834 const bool IsG16 = 5835 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; 5836 const bool IsA16 = AddrTy == S16; 5837 const bool IsD16 = Ty.getScalarType() == S16; 5838 5839 int DMaskLanes = 0; 5840 if (!BaseOpcode->Atomic) { 5841 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 5842 if (BaseOpcode->Gather4) { 5843 DMaskLanes = 4; 5844 } else if (DMask != 0) { 5845 DMaskLanes = llvm::popcount(DMask); 5846 } else if (!IsTFE && !BaseOpcode->Store) { 5847 // If dmask is 0, this is a no-op load. This can be eliminated. 5848 B.buildUndef(MI.getOperand(0)); 5849 MI.eraseFromParent(); 5850 return true; 5851 } 5852 } 5853 5854 Observer.changingInstr(MI); 5855 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 5856 5857 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 5858 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 5859 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 5860 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 5861 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 5862 5863 // Track that we legalized this 5864 MI.setDesc(B.getTII().get(NewOpcode)); 5865 5866 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 5867 // dmask to be at least 1 otherwise the instruction will fail 5868 if (IsTFE && DMask == 0) { 5869 DMask = 0x1; 5870 DMaskLanes = 1; 5871 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 5872 } 5873 5874 if (BaseOpcode->Atomic) { 5875 Register VData0 = MI.getOperand(2).getReg(); 5876 LLT Ty = MRI->getType(VData0); 5877 5878 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 5879 if (Ty.isVector()) 5880 return false; 5881 5882 if (BaseOpcode->AtomicX2) { 5883 Register VData1 = MI.getOperand(3).getReg(); 5884 // The two values are packed in one register. 5885 LLT PackedTy = LLT::fixed_vector(2, Ty); 5886 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 5887 MI.getOperand(2).setReg(Concat.getReg(0)); 5888 MI.getOperand(3).setReg(AMDGPU::NoRegister); 5889 } 5890 } 5891 5892 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 5893 5894 // Rewrite the addressing register layout before doing anything else. 
5895 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 5896 // 16 bit gradients are supported, but are tied to the A16 control 5897 // so both gradients and addresses must be 16 bit 5898 return false; 5899 } 5900 5901 if (IsA16 && !ST.hasA16()) { 5902 // A16 not supported 5903 return false; 5904 } 5905 5906 const unsigned NSAMaxSize = ST.getNSAMaxSize(); 5907 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); 5908 5909 if (IsA16 || IsG16) { 5910 if (Intr->NumVAddrs > 1) { 5911 SmallVector<Register, 4> PackedRegs; 5912 5913 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 5914 IsG16); 5915 5916 // See also below in the non-a16 branch 5917 const bool UseNSA = ST.hasNSAEncoding() && 5918 PackedRegs.size() >= ST.getNSAThreshold(MF) && 5919 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); 5920 const bool UsePartialNSA = 5921 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; 5922 5923 if (UsePartialNSA) { 5924 // Pack registers that would go over NSAMaxSize into last VAddr register 5925 LLT PackedAddrTy = 5926 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); 5927 auto Concat = B.buildConcatVectors( 5928 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); 5929 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); 5930 PackedRegs.resize(NSAMaxSize); 5931 } else if (!UseNSA && PackedRegs.size() > 1) { 5932 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 5933 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 5934 PackedRegs[0] = Concat.getReg(0); 5935 PackedRegs.resize(1); 5936 } 5937 5938 const unsigned NumPacked = PackedRegs.size(); 5939 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 5940 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 5941 if (!SrcOp.isReg()) { 5942 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 5943 continue; 5944 } 5945 5946 assert(SrcOp.getReg() != AMDGPU::NoRegister); 5947 5948 if (I - Intr->VAddrStart < NumPacked) 5949 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 5950 else 5951 SrcOp.setReg(AMDGPU::NoRegister); 5952 } 5953 } 5954 } else { 5955 // If the register allocator cannot place the address registers contiguously 5956 // without introducing moves, then using the non-sequential address encoding 5957 // is always preferable, since it saves VALU instructions and is usually a 5958 // wash in terms of code size or even better. 5959 // 5960 // However, we currently have no way of hinting to the register allocator 5961 // that MIMG addresses should be placed contiguously when it is possible to 5962 // do so, so force non-NSA for the common 2-address case as a heuristic. 5963 // 5964 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 5965 // allocation when possible. 5966 // 5967 // Partial NSA is allowed on GFX11 where the final register is a contiguous 5968 // set of the remaining addresses. 
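// For example (illustrative numbers only): with an NSA limit of 5 and 7
// address dwords, full NSA is not encodable, but with partial NSA the first
// four vaddr operands stay in individual registers and the remaining three
// dwords are packed into one contiguous register, which is what the
// UsePartialNSA path below does.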
5969 const bool UseNSA = ST.hasNSAEncoding() && 5970 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && 5971 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); 5972 const bool UsePartialNSA = 5973 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; 5974 5975 if (UsePartialNSA) { 5976 convertImageAddrToPacked(B, MI, 5977 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, 5978 Intr->NumVAddrs - NSAMaxSize + 1); 5979 } else if (!UseNSA && Intr->NumVAddrs > 1) { 5980 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 5981 Intr->NumVAddrs); 5982 } 5983 } 5984 5985 int Flags = 0; 5986 if (IsA16) 5987 Flags |= 1; 5988 if (IsG16) 5989 Flags |= 2; 5990 MI.addOperand(MachineOperand::CreateImm(Flags)); 5991 5992 if (BaseOpcode->Store) { // No TFE for stores? 5993 // TODO: Handle dmask trim 5994 if (!Ty.isVector() || !IsD16) 5995 return true; 5996 5997 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 5998 if (RepackedReg != VData) { 5999 MI.getOperand(1).setReg(RepackedReg); 6000 } 6001 6002 return true; 6003 } 6004 6005 Register DstReg = MI.getOperand(0).getReg(); 6006 const LLT EltTy = Ty.getScalarType(); 6007 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 6008 6009 // Confirm that the return type is large enough for the dmask specified 6010 if (NumElts < DMaskLanes) 6011 return false; 6012 6013 if (NumElts > 4 || DMaskLanes > 4) 6014 return false; 6015 6016 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 6017 const LLT AdjustedTy = 6018 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 6019 6020 // The raw dword aligned data component of the load. The only legal cases 6021 // where this matters should be when using the packed D16 format, for 6022 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 6023 LLT RoundedTy; 6024 6025 // S32 vector to cover all data, plus TFE result element. 6026 LLT TFETy; 6027 6028 // Register type to use for each loaded component. Will be S32 or V2S16. 6029 LLT RegTy; 6030 6031 if (IsD16 && ST.hasUnpackedD16VMem()) { 6032 RoundedTy = 6033 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 6034 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 6035 RegTy = S32; 6036 } else { 6037 unsigned EltSize = EltTy.getSizeInBits(); 6038 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 6039 unsigned RoundedSize = 32 * RoundedElts; 6040 RoundedTy = LLT::scalarOrVector( 6041 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 6042 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 6043 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 6044 } 6045 6046 // The return type does not need adjustment. 6047 // TODO: Should we change s16 case to s32 or <2 x s16>? 6048 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 6049 return true; 6050 6051 Register Dst1Reg; 6052 6053 // Insert after the instruction. 6054 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 6055 6056 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 6057 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 6058 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 6059 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 6060 6061 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 6062 6063 MI.getOperand(0).setReg(NewResultReg); 6064 6065 // In the IR, TFE is supposed to be used with a 2 element struct return 6066 // type. 
The instruction really returns these two values in one contiguous 6067 // register, with one additional dword beyond the loaded data. Rewrite the 6068 // return type to use a single register result. 6069 6070 if (IsTFE) { 6071 Dst1Reg = MI.getOperand(1).getReg(); 6072 if (MRI->getType(Dst1Reg) != S32) 6073 return false; 6074 6075 // TODO: Make sure the TFE operand bit is set. 6076 MI.removeOperand(1); 6077 6078 // Handle the easy case that requires no repack instructions. 6079 if (Ty == S32) { 6080 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 6081 return true; 6082 } 6083 } 6084 6085 // Now figure out how to copy the new result register back into the old 6086 // result. 6087 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 6088 6089 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 6090 6091 if (ResultNumRegs == 1) { 6092 assert(!IsTFE); 6093 ResultRegs[0] = NewResultReg; 6094 } else { 6095 // We have to repack into a new vector of some kind. 6096 for (int I = 0; I != NumDataRegs; ++I) 6097 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 6098 B.buildUnmerge(ResultRegs, NewResultReg); 6099 6100 // Drop the final TFE element to get the data part. The TFE result is 6101 // directly written to the right place already. 6102 if (IsTFE) 6103 ResultRegs.resize(NumDataRegs); 6104 } 6105 6106 // For an s16 scalar result, we form an s32 result with a truncate regardless 6107 // of packed vs. unpacked. 6108 if (IsD16 && !Ty.isVector()) { 6109 B.buildTrunc(DstReg, ResultRegs[0]); 6110 return true; 6111 } 6112 6113 // Avoid a build/concat_vector of 1 entry. 6114 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 6115 B.buildBitcast(DstReg, ResultRegs[0]); 6116 return true; 6117 } 6118 6119 assert(Ty.isVector()); 6120 6121 if (IsD16) { 6122 // For packed D16 results with TFE enabled, all the data components are 6123 // S32. Cast back to the expected type. 6124 // 6125 // TODO: We don't really need to use load s32 elements. We would only need one 6126 // cast for the TFE result if a multiple of v2s16 was used. 6127 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 6128 for (Register &Reg : ResultRegs) 6129 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 6130 } else if (ST.hasUnpackedD16VMem()) { 6131 for (Register &Reg : ResultRegs) 6132 Reg = B.buildTrunc(S16, Reg).getReg(0); 6133 } 6134 } 6135 6136 auto padWithUndef = [&](LLT Ty, int NumElts) { 6137 if (NumElts == 0) 6138 return; 6139 Register Undef = B.buildUndef(Ty).getReg(0); 6140 for (int I = 0; I != NumElts; ++I) 6141 ResultRegs.push_back(Undef); 6142 }; 6143 6144 // Pad out any elements eliminated due to the dmask. 6145 LLT ResTy = MRI->getType(ResultRegs[0]); 6146 if (!ResTy.isVector()) { 6147 padWithUndef(ResTy, NumElts - ResultRegs.size()); 6148 B.buildBuildVector(DstReg, ResultRegs); 6149 return true; 6150 } 6151 6152 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 6153 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 6154 6155 // Deal with the one annoying legal case. 
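// (That case is a <3 x s16> result: the loaded data comes back as dword-sized
// v2s16 pieces, so the repacked value is either trimmed or padded with undef
// elements to match the declared result type.)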
6156 const LLT V3S16 = LLT::fixed_vector(3, 16); 6157 if (Ty == V3S16) { 6158 if (IsTFE) { 6159 if (ResultRegs.size() == 1) { 6160 NewResultReg = ResultRegs[0]; 6161 } else if (ResultRegs.size() == 2) { 6162 LLT V4S16 = LLT::fixed_vector(4, 16); 6163 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 6164 } else { 6165 return false; 6166 } 6167 } 6168 6169 if (MRI->getType(DstReg).getNumElements() < 6170 MRI->getType(NewResultReg).getNumElements()) { 6171 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 6172 } else { 6173 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 6174 } 6175 return true; 6176 } 6177 6178 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 6179 B.buildConcatVectors(DstReg, ResultRegs); 6180 return true; 6181 } 6182 6183 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 6184 LegalizerHelper &Helper, MachineInstr &MI) const { 6185 MachineIRBuilder &B = Helper.MIRBuilder; 6186 GISelChangeObserver &Observer = Helper.Observer; 6187 6188 Register Dst = MI.getOperand(0).getReg(); 6189 LLT Ty = B.getMRI()->getType(Dst); 6190 unsigned Size = Ty.getSizeInBits(); 6191 MachineFunction &MF = B.getMF(); 6192 6193 Observer.changingInstr(MI); 6194 6195 // Handle needing to s.buffer.load() a p8 value. 6196 if (hasBufferRsrcWorkaround(Ty)) { 6197 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); 6198 Dst = MI.getOperand(0).getReg(); 6199 B.setInsertPt(B.getMBB(), MI); 6200 } 6201 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 6202 Ty = getBitcastRegisterType(Ty); 6203 Helper.bitcastDst(MI, Ty, 0); 6204 Dst = MI.getOperand(0).getReg(); 6205 B.setInsertPt(B.getMBB(), MI); 6206 } 6207 6208 // FIXME: We don't really need this intermediate instruction. The intrinsic 6209 // should be fixed to have a memory operand. Since it's readnone, we're not 6210 // allowed to add one. 6211 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 6212 MI.removeOperand(1); // Remove intrinsic ID 6213 6214 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 6215 // TODO: Should this use datalayout alignment? 6216 const unsigned MemSize = (Size + 7) / 8; 6217 const Align MemAlign(4); 6218 MachineMemOperand *MMO = MF.getMachineMemOperand( 6219 MachinePointerInfo(), 6220 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6221 MachineMemOperand::MOInvariant, 6222 MemSize, MemAlign); 6223 MI.addMemOperand(MF, MMO); 6224 6225 // There are no 96-bit result scalar loads, but widening to 128-bit should 6226 // always be legal. We may need to restore this to a 96-bit result if it turns 6227 // out this needs to be converted to a vector load during RegBankSelect. 6228 if (!isPowerOf2_32(Size)) { 6229 if (Ty.isVector()) 6230 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 6231 else 6232 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 6233 } 6234 6235 Observer.changedInstr(MI); 6236 return true; 6237 } 6238 6239 // TODO: Move to selection 6240 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 6241 MachineRegisterInfo &MRI, 6242 MachineIRBuilder &B) const { 6243 if (!ST.isTrapHandlerEnabled() || 6244 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 6245 return legalizeTrapEndpgm(MI, MRI, B); 6246 6247 const Module *M = B.getMF().getFunction().getParent(); 6248 unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); 6249 if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) 6250 return legalizeTrapHsaQueuePtr(MI, MRI, B); 6251 6252 return ST.supportsGetDoorbellID() ? 
6253 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); 6254 } 6255 6256 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 6257 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6258 const DebugLoc &DL = MI.getDebugLoc(); 6259 MachineBasicBlock &BB = B.getMBB(); 6260 MachineFunction *MF = BB.getParent(); 6261 6262 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) { 6263 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 6264 .addImm(0); 6265 MI.eraseFromParent(); 6266 return true; 6267 } 6268 6269 // We need a block split to make the real endpgm a terminator. We also don't 6270 // want to break phis in successor blocks, so we can't just delete to the 6271 // end of the block. 6272 BB.splitAt(MI, false /*UpdateLiveIns*/); 6273 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 6274 MF->push_back(TrapBB); 6275 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 6276 .addImm(0); 6277 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ)) 6278 .addMBB(TrapBB); 6279 6280 BB.addSuccessor(TrapBB); 6281 MI.eraseFromParent(); 6282 return true; 6283 } 6284 6285 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 6286 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6287 MachineFunction &MF = B.getMF(); 6288 const LLT S64 = LLT::scalar(64); 6289 6290 Register SGPR01(AMDGPU::SGPR0_SGPR1); 6291 // For code object version 5, queue_ptr is passed through implicit kernarg. 6292 if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= 6293 AMDGPU::AMDHSA_COV5) { 6294 AMDGPUTargetLowering::ImplicitParameter Param = 6295 AMDGPUTargetLowering::QUEUE_PTR; 6296 uint64_t Offset = 6297 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 6298 6299 Register KernargPtrReg = MRI.createGenericVirtualRegister( 6300 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6301 6302 if (!loadInputValue(KernargPtrReg, B, 6303 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 6304 return false; 6305 6306 // TODO: can we be smarter about machine pointer info? 
6307 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 6308 MachineMemOperand *MMO = MF.getMachineMemOperand( 6309 PtrInfo, 6310 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6311 MachineMemOperand::MOInvariant, 6312 LLT::scalar(64), commonAlignment(Align(64), Offset)); 6313 6314 // Pointer address 6315 Register LoadAddr = MRI.createGenericVirtualRegister( 6316 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6317 B.buildPtrAdd(LoadAddr, KernargPtrReg, 6318 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 6319 // Load address 6320 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 6321 B.buildCopy(SGPR01, Temp); 6322 B.buildInstr(AMDGPU::S_TRAP) 6323 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 6324 .addReg(SGPR01, RegState::Implicit); 6325 MI.eraseFromParent(); 6326 return true; 6327 } 6328 6329 // Pass queue pointer to trap handler as input, and insert trap instruction 6330 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 6331 Register LiveIn = 6332 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6333 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 6334 return false; 6335 6336 B.buildCopy(SGPR01, LiveIn); 6337 B.buildInstr(AMDGPU::S_TRAP) 6338 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 6339 .addReg(SGPR01, RegState::Implicit); 6340 6341 MI.eraseFromParent(); 6342 return true; 6343 } 6344 6345 bool AMDGPULegalizerInfo::legalizeTrapHsa( 6346 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6347 B.buildInstr(AMDGPU::S_TRAP) 6348 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 6349 MI.eraseFromParent(); 6350 return true; 6351 } 6352 6353 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 6354 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6355 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 6356 // accordingly 6357 if (!ST.isTrapHandlerEnabled() || 6358 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 6359 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 6360 "debugtrap handler not supported", 6361 MI.getDebugLoc(), DS_Warning); 6362 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 6363 Ctx.diagnose(NoTrap); 6364 } else { 6365 // Insert debug-trap instruction 6366 B.buildInstr(AMDGPU::S_TRAP) 6367 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 6368 } 6369 6370 MI.eraseFromParent(); 6371 return true; 6372 } 6373 6374 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 6375 MachineIRBuilder &B) const { 6376 MachineRegisterInfo &MRI = *B.getMRI(); 6377 const LLT S16 = LLT::scalar(16); 6378 const LLT S32 = LLT::scalar(32); 6379 const LLT V2S16 = LLT::fixed_vector(2, 16); 6380 const LLT V3S32 = LLT::fixed_vector(3, 32); 6381 6382 Register DstReg = MI.getOperand(0).getReg(); 6383 Register NodePtr = MI.getOperand(2).getReg(); 6384 Register RayExtent = MI.getOperand(3).getReg(); 6385 Register RayOrigin = MI.getOperand(4).getReg(); 6386 Register RayDir = MI.getOperand(5).getReg(); 6387 Register RayInvDir = MI.getOperand(6).getReg(); 6388 Register TDescr = MI.getOperand(7).getReg(); 6389 6390 if (!ST.hasGFX10_AEncoding()) { 6391 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 6392 "intrinsic not supported on subtarget", 6393 MI.getDebugLoc()); 6394 B.getMF().getFunction().getContext().diagnose(BadIntrin); 6395 return false; 6396 } 6397 6398 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); 6399 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 6400 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 6401 const unsigned NumVDataDwords = 4; 6402 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 6403 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; 6404 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); 6405 const unsigned BaseOpcodes[2][2] = { 6406 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 6407 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 6408 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 6409 int Opcode; 6410 if (UseNSA) { 6411 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 6412 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA 6413 : AMDGPU::MIMGEncGfx10NSA, 6414 NumVDataDwords, NumVAddrDwords); 6415 } else { 6416 Opcode = AMDGPU::getMIMGOpcode( 6417 BaseOpcodes[Is64][IsA16], 6418 IsGFX11Plus ? 
AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, 6419 NumVDataDwords, NumVAddrDwords); 6420 } 6421 assert(Opcode != -1); 6422 6423 SmallVector<Register, 12> Ops; 6424 if (UseNSA && IsGFX11Plus) { 6425 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { 6426 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 6427 auto Merged = B.buildMergeLikeInstr( 6428 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); 6429 Ops.push_back(Merged.getReg(0)); 6430 }; 6431 6432 Ops.push_back(NodePtr); 6433 Ops.push_back(RayExtent); 6434 packLanes(RayOrigin); 6435 6436 if (IsA16) { 6437 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 6438 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 6439 auto MergedDir = B.buildMergeLikeInstr( 6440 V3S32, 6441 {B.buildBitcast( 6442 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0), 6443 UnmergeRayDir.getReg(0)})) 6444 .getReg(0), 6445 B.buildBitcast( 6446 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1), 6447 UnmergeRayDir.getReg(1)})) 6448 .getReg(0), 6449 B.buildBitcast( 6450 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2), 6451 UnmergeRayDir.getReg(2)})) 6452 .getReg(0)}); 6453 Ops.push_back(MergedDir.getReg(0)); 6454 } else { 6455 packLanes(RayDir); 6456 packLanes(RayInvDir); 6457 } 6458 } else { 6459 if (Is64) { 6460 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 6461 Ops.push_back(Unmerge.getReg(0)); 6462 Ops.push_back(Unmerge.getReg(1)); 6463 } else { 6464 Ops.push_back(NodePtr); 6465 } 6466 Ops.push_back(RayExtent); 6467 6468 auto packLanes = [&Ops, &S32, &B](Register Src) { 6469 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 6470 Ops.push_back(Unmerge.getReg(0)); 6471 Ops.push_back(Unmerge.getReg(1)); 6472 Ops.push_back(Unmerge.getReg(2)); 6473 }; 6474 6475 packLanes(RayOrigin); 6476 if (IsA16) { 6477 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 6478 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 6479 Register R1 = MRI.createGenericVirtualRegister(S32); 6480 Register R2 = MRI.createGenericVirtualRegister(S32); 6481 Register R3 = MRI.createGenericVirtualRegister(S32); 6482 B.buildMergeLikeInstr(R1, 6483 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 6484 B.buildMergeLikeInstr( 6485 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 6486 B.buildMergeLikeInstr( 6487 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 6488 Ops.push_back(R1); 6489 Ops.push_back(R2); 6490 Ops.push_back(R3); 6491 } else { 6492 packLanes(RayDir); 6493 packLanes(RayInvDir); 6494 } 6495 } 6496 6497 if (!UseNSA) { 6498 // Build a single vector containing all the operands so far prepared. 6499 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 6500 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0); 6501 Ops.clear(); 6502 Ops.push_back(MergedOps); 6503 } 6504 6505 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 6506 .addDef(DstReg) 6507 .addImm(Opcode); 6508 6509 for (Register R : Ops) { 6510 MIB.addUse(R); 6511 } 6512 6513 MIB.addUse(TDescr) 6514 .addImm(IsA16 ? 
  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
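// Lower the fptrunc-with-rounding-mode intrinsic: only the TowardPositive and
// TowardNegative rounding modes are handled, mapping to the
// G_FPTRUNC_ROUND_UPWARD / G_FPTRUNC_ROUND_DOWNWARD pseudos; any other mode
// fails legalization.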
bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_make_buffer_rsrc:
    return legalizePointerAsRsrcIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
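  // The legacy r600 group/local/global size queries below are lowered to
  // loads from the kernarg segment at fixed SI::KernelInputOffsets offsets.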
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  // TODO: Could insert G_ASSERT_ZEXT from s16
  case Intrinsic::r600_read_local_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
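  // All raw/struct buffer atomic intrinsics (and their ptr variants) share a
  // single legalization path via legalizeBufferAtomic.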
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
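  // Trap and debug-trap intrinsics are lowered by dedicated helpers; see
  // legalizeDebugTrapIntrinsic above for how a missing or unsupported trap
  // handler is diagnosed.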
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;

    // FIXME: This is to work around the inability of tablegen match combiners
    // to match intrinsics in patterns.
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}